From d0cfe89ffe5ad7cf00881cf752c109e25d57628c Mon Sep 17 00:00:00 2001 From: Hugo Trentesaux <hugo@trentesaux.fr> Date: Fri, 5 Apr 2024 20:33:25 +0200 Subject: [PATCH] do c+ import --- README.md | 10 +++- src/cesium-plus.ts | 76 +++++++++++++++++++++++++++++-- src/processor.ts | 4 +- src/scripts/cesium-plus-import.ts | 37 ++++++++++++++- src/types.ts | 2 - 5 files changed, 119 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b79f880..4bd1433 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ pnpm dev After exporting the data to json files with the Rust script from `v2s-datapod`. ```sh +# doImport() # takes about 200 seconds (4 minutes) time npx tsx src/scripts/cesium-plus-import.ts ``` @@ -22,4 +23,11 @@ You can then manually pin the cid according to the command output. ipfs pin add -r bafyreie74jtf23zzz2tdgsz7axfrm4pidje43ypqn25v4gkdtfjbcj62km ``` -This will make easier to insert this data in any AMT or other data structure. \ No newline at end of file +This will make it easier to insert this data in any AMT or other data structure. 
+ +```sh +# doMergeAMT() +# takes about 50 seconds +time npx tsx src/scripts/cesium-plus-import.ts +# bafyreie23z6aayg5pjrqiowwziv2zt55b3ijzch3bjf5u4ebfbjtl5raxe +``` diff --git a/src/cesium-plus.ts b/src/cesium-plus.ts index 6f25035..41dd798 100644 --- a/src/cesium-plus.ts +++ b/src/cesium-plus.ts @@ -1,8 +1,16 @@ import { CID } from 'multiformats' import { kubo } from './kubo' import { Buffer } from 'buffer' -import { insertKnownIndexRequest } from './processor' -import type { IndexRequest, Pointer } from './types' +import { + insertKnownIndexRequest, + timestampToKey, + bucket, + compareKey, + type resultType, + type diffResult +} from './processor' +import { emptyInode, type IndexLeaf, type IndexRequest, type IndexVinode, type Pointer } from './types' +import { BASE, KEYSIZE } from './consts' // for reference see // https://doc.e-is.pro/cesium-plus-pod/REST_API.html @@ -58,7 +66,7 @@ export async function processCesiumPlusProfile(obj: CplusProfile): Promise<CID> } } -// import these cid to target AMT +// import these cid to target AMT, naive approach one by one and asynchronous export async function importCplusToAMT(cplusCID: CID, amtCid: Pointer<CID>) { const cplusroot = await kubo.dag.get(cplusCID) for (let chunkcid of cplusroot.value) { @@ -106,3 +114,65 @@ export async function importCplusToAMTSync(cplusCID: CID, amtCid: CID): Promise< } return amtCid } + +// build virtual AMT from cesium root CID to prepare merge + +/// convert array of key/value pairs to virtual inode (tree) +// opti: use a dichotomy search instead of iterating over all elements +export function arrayToVinode(array: Array<[string, CID]>): IndexVinode { + const node = emptyInode() as IndexVinode + for (let b = 0; b < BASE; b++) { + const subArray: Array<[string, CID]> = [] + do { + const elt = array.shift() + if (elt == undefined) { + break + } + const k = elt[0] + if (bucket(k) != b) { + array.unshift(elt) + break + } + subArray.push(elt) + } while (true) + if (subArray.length > 0) { + // 
not empty + const k1 = subArray.at(0)![0] + const k2 = subArray.at(-1)![0] + if (k1 == k2) { + node.children[b] = [k1, arrayToLeaf(subArray.map(([k, v]) => v))] + continue + } + const c = compareKey(k1, k2) as diffResult + const minimalSubArray: Array<[string, CID]> = subArray.map(([k, v]) => [k.slice(c.common.length), v]) + node.children[b] = [c.common, arrayToVinode(minimalSubArray)] + } + } + return node +} + +export function arrayToLeaf(array: CID[]): IndexLeaf { + return { leaf: array.sort((a, b) => (a.toString() < b.toString() ? -1 : 1)) } +} + +// sort all cids and convert timestamp to key +export function sortCids(allCIDs: Array<[number, CID]>): Array<[string, CID]> { + allCIDs.sort() + return allCIDs.map(([t, c]) => [timestampToKey(t), c]) +} + +// retrieve all CIDs +export async function allCplusCids(cplusCID: CID): Promise<Array<[number, CID]>> { + console.log(Date.now() + ' getting all cplus data') + const allCIDs: Array<Promise<[number, CID]>> = [] + const cplusroot = await kubo.dag.get(cplusCID) + for (let chunkcid of cplusroot.value) { + const chunk = await kubo.dag.get(chunkcid) + for (let pcid of chunk.value) { + const p = kubo.dag.get(pcid) + const profile: Promise<[number, CID]> = p.then((v) => [v.value.time, pcid]) + allCIDs.push(profile) + } + } + return Promise.all(allCIDs) +} diff --git a/src/processor.ts b/src/processor.ts index 8f6596e..06ce664 100644 --- a/src/processor.ts +++ b/src/processor.ts @@ -12,7 +12,7 @@ import { } from './types' // convert timestamp to key -function timestampToKey(timestamp: number): string { +export function timestampToKey(timestamp: number): string { return timestamp.toString(BASE).padStart(KEYSIZE, '0') } @@ -100,7 +100,7 @@ export interface ProcessFunction { } // return bucket corresponding to given letter -function bucket(letter: string): number { +export function bucket(letter: string): number { return parseInt(letter[0], BASE) } diff --git a/src/scripts/cesium-plus-import.ts 
b/src/scripts/cesium-plus-import.ts index f3b3067..5a55757 100644 --- a/src/scripts/cesium-plus-import.ts +++ b/src/scripts/cesium-plus-import.ts @@ -1,6 +1,16 @@ import { CID } from 'multiformats' -import { processCesiumPlusImport, processCesiumPlusProfile, importCplusToAMTSync } from '../cesium-plus' +import { + processCesiumPlusImport, + processCesiumPlusProfile, + importCplusToAMTSync, + allCplusCids, + sortCids, + arrayToVinode +} from '../cesium-plus' import * as fs from 'fs/promises' +import { kubo } from '../kubo' +import type { IndexInode } from '@/types' +import { mergeInodesSync } from '../processor' // profile files // const PROFILES = '/home/hugo/ipfs/v2s-datapod/migrate_csplus/profile_csplus.json' @@ -33,4 +43,27 @@ function doImportToAMT() { importCplusToAMTSync(cplus, amt).then(console.log) } -doImportToAMT() \ No newline at end of file +// this is a more optimized version that takes 50 seconds to import all 50000 profiles → 1000 profiles per second +async function doMergeAMT() { + const cplus = CID.parse('bafyreie74jtf23zzz2tdgsz7axfrm4pidje43ypqn25v4gkdtfjbcj62km') // cesium plus import + const amt = CID.parse('bafyreicvlp2p65agkxpzcboedba7zit55us4zvtyyq2wesvsdedy6irwfy') // empty root cid + + const rootNode: IndexInode = (await kubo.dag.get(amt)).value + + allCplusCids(cplus) + .then(sortCids) + .then((all) => { + console.log(Date.now() + ' converting to virtual tree ') + return arrayToVinode(all) + }) + .then((inode) => { + console.log(Date.now() + ' merging') + return mergeInodesSync(rootNode, inode) + }) + .then((cid) => { + console.log(Date.now() + ' finished') + console.log(cid) + }) +} + +doMergeAMT() diff --git a/src/types.ts b/src/types.ts index 43e2e29..bf1d2f5 100644 --- a/src/types.ts +++ b/src/types.ts @@ -26,8 +26,6 @@ export interface IndexInode { // same as IndexInode but mutable and only in memory // allows to process batch insert without having to change all intermediate nodes (including root node) each time export interface 
IndexVinode { - // can be a total new index inode (null) or be already aware of its old CID - cid: null | CID // same as IndexInode children: (null | [string, IndexVinode | IndexLeaf])[] } -- GitLab