diff --git a/README.md b/README.md
index b79f88024990a28ac675ddb073219331d83c99c7..4bd14332cc8fee8016561c907395cb715bff8cb0 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ pnpm dev
 After exporting the data to json files with the Rust script from `v2s-datapod`.
 
 ```sh
+# doImport()
 # takes about 200 seconds (4 minutes)
 time npx tsx src/scripts/cesium-plus-import.ts
 ```
@@ -22,4 +23,11 @@ You can then manually pin the cid according to the command output.
 ipfs pin add -r bafyreie74jtf23zzz2tdgsz7axfrm4pidje43ypqn25v4gkdtfjbcj62km
 ```
 
-This will make easier to insert this data in any AMT or other data structure.
\ No newline at end of file
+This will make it easier to insert this data in any AMT or other data structure.
+
+```sh
+# doMergeAMT()
+# takes about 50 seconds
+time npx tsx src/scripts/cesium-plus-import.ts
+# bafyreie23z6aayg5pjrqiowwziv2zt55b3ijzch3bjf5u4ebfbjtl5raxe
+```
diff --git a/src/cesium-plus.ts b/src/cesium-plus.ts
index 6f250354b34f181859f74c9eb2eb872b0cd4ae63..41dd7980808f690c9f04e0d395dd5f25377f5357 100644
--- a/src/cesium-plus.ts
+++ b/src/cesium-plus.ts
@@ -1,8 +1,16 @@
 import { CID } from 'multiformats'
 import { kubo } from './kubo'
 import { Buffer } from 'buffer'
-import { insertKnownIndexRequest } from './processor'
-import type { IndexRequest, Pointer } from './types'
+import {
+  insertKnownIndexRequest,
+  timestampToKey,
+  bucket,
+  compareKey,
+  type resultType,
+  type diffResult
+} from './processor'
+import { emptyInode, type IndexLeaf, type IndexRequest, type IndexVinode, type Pointer } from './types'
+import { BASE, KEYSIZE } from './consts'
 
 // for reference see
 // https://doc.e-is.pro/cesium-plus-pod/REST_API.html
@@ -58,7 +66,7 @@ export async function processCesiumPlusProfile(obj: CplusProfile): Promise<CID>
   }
 }
 
-// import these cid to target AMT
+// import these cid to target AMT, naive approach one by one and asynchronous
 export async function importCplusToAMT(cplusCID: CID, amtCid: Pointer<CID>) {
   const cplusroot = await kubo.dag.get(cplusCID)
   for (let chunkcid of cplusroot.value) {
@@ -106,3 +114,98 @@ export async function importCplusToAMTSync(cplusCID: CID, amtCid: CID): Promise<
   }
   return amtCid
 }
+
+// build virtual AMT from cesium root CID to prepare merge
+
+/**
+ * Convert an array of key/value pairs to a virtual inode (tree).
+ *
+ * The array must be sorted by key (see `sortCids`): each bucket is filled with
+ * the contiguous run of entries whose key starts in that bucket. The input
+ * array is read with a cursor and left untouched.
+ *
+ * @throws Error when some entries can not be placed (unsorted or invalid keys)
+ */
+export function arrayToVinode(array: Array<[string, CID]>): IndexVinode {
+  const node = emptyInode() as IndexVinode
+  // index of the first entry not yet placed in the tree
+  let start = 0
+  for (let b = 0; b < BASE && start < array.length; b++) {
+    // find the end of the run of entries belonging to bucket b
+    let end = start
+    while (end < array.length && bucket(array[end]![0]) === b) {
+      end++
+    }
+    if (end === start) {
+      // empty bucket
+      continue
+    }
+    const subArray = array.slice(start, end)
+    start = end
+    const k1 = subArray[0]![0]
+    const k2 = subArray[subArray.length - 1]![0]
+    if (k1 === k2) {
+      // every entry of the run has the same key: store the values in a single leaf
+      node.children[b] = [k1, arrayToLeaf(subArray.map(([, v]) => v))]
+      continue
+    }
+    // strip the prefix common to the whole run and recurse on the remaining key parts
+    const c = compareKey(k1, k2) as diffResult
+    const minimalSubArray: Array<[string, CID]> = subArray.map(([k, v]) => [k.slice(c.common.length), v])
+    node.children[b] = [c.common, arrayToVinode(minimalSubArray)]
+  }
+  if (start < array.length) {
+    // silently dropping entries would produce an incomplete tree
+    throw new Error('arrayToVinode: ' + (array.length - start) + ' entries left, keys must be sorted and valid')
+  }
+  return node
+}
+
+/** Build a leaf holding the given CIDs sorted by their string form (the input array is not modified). */
+export function arrayToLeaf(array: CID[]): IndexLeaf {
+  const leaf = [...array].sort((a, b) => {
+    const sa = a.toString()
+    const sb = b.toString()
+    return sa < sb ? -1 : sa > sb ? 1 : 0
+  })
+  return { leaf }
+}
+
+/**
+ * Sort all [timestamp, cid] pairs and convert each timestamp to its key.
+ *
+ * The sort is numeric on the timestamp (the default `Array.prototype.sort` compares
+ * string representations and misorders numbers of different lengths), ties are broken
+ * on the CID string. The input array is not modified.
+ */
+export function sortCids(allCIDs: Array<[number, CID]>): Array<[string, CID]> {
+  const sorted = [...allCIDs].sort(([t1, c1], [t2, c2]) => {
+    if (t1 !== t2) {
+      return t1 - t2
+    }
+    const s1 = c1.toString()
+    const s2 = c2.toString()
+    return s1 < s2 ? -1 : s1 > s2 ? 1 : 0
+  })
+  return sorted.map(([t, c]) => [timestampToKey(t), c])
+}
+
+/**
+ * Retrieve the [timestamp, cid] pair of every profile referenced by the cesium plus root CID.
+ * Chunk nodes are awaited one after the other, profile nodes are requested without
+ * awaiting and resolved together at the end.
+ */
+export async function allCplusCids(cplusCID: CID): Promise<Array<[number, CID]>> {
+  console.log(Date.now() + ' getting all cplus data')
+  const allCIDs: Array<Promise<[number, CID]>> = []
+  const cplusroot = await kubo.dag.get(cplusCID)
+  for (const chunkcid of cplusroot.value) {
+    const chunk = await kubo.dag.get(chunkcid)
+    for (const pcid of chunk.value) {
+      const p = kubo.dag.get(pcid)
+      const profile: Promise<[number, CID]> = p.then((v) => [v.value.time, pcid])
+      allCIDs.push(profile)
+    }
+  }
+  return Promise.all(allCIDs)
+}
diff --git a/src/processor.ts b/src/processor.ts
index 8f6596ed6e21869557883853cecd11132ac97f21..06ce664ffbed205e84156cfdd4fccc6fb76c1a5f 100644
--- a/src/processor.ts
+++ b/src/processor.ts
@@ -12,7 +12,7 @@ import {
 } from './types'
 
 // convert timestamp to key
-function timestampToKey(timestamp: number): string {
+export function timestampToKey(timestamp: number): string {
   return timestamp.toString(BASE).padStart(KEYSIZE, '0')
 }
 
@@ -100,7 +100,7 @@ export interface ProcessFunction {
 }
 
 // return bucket corresponding to given letter
-function bucket(letter: string): number {
+export function bucket(letter: string): number {
   return parseInt(letter[0], BASE)
 }
 
diff --git a/src/scripts/cesium-plus-import.ts b/src/scripts/cesium-plus-import.ts
index f3b3067be53405497fee3e286f3e8b12fdc79885..5a5575792e7a45a632d4eca543a2960cae63675d 100644
--- a/src/scripts/cesium-plus-import.ts
+++ b/src/scripts/cesium-plus-import.ts
@@ -1,6 +1,16 @@
 import { CID } from 'multiformats'
-import { processCesiumPlusImport, processCesiumPlusProfile, importCplusToAMTSync } from '../cesium-plus'
+import {
+  processCesiumPlusImport,
+  processCesiumPlusProfile,
+  importCplusToAMTSync,
+  allCplusCids,
+  sortCids,
+  arrayToVinode
+} from '../cesium-plus'
 import * as fs from 'fs/promises'
+import { kubo } from '../kubo'
+import type { IndexInode } from '@/types'
+import { mergeInodesSync } from '../processor'
 
 // profile files
 // const PROFILES = '/home/hugo/ipfs/v2s-datapod/migrate_csplus/profile_csplus.json'
@@ -33,4 +43,25 @@ function doImportToAMT() {
   importCplusToAMTSync(cplus, amt).then(console.log)
 }
 
-doImportToAMT()
\ No newline at end of file
+// this is a more optimized version that takes 50 seconds to import all 50000 profiles → 1000 profiles per second
+// it builds a virtual tree from all sorted profiles, then merges it into the root node
+async function doMergeAMT(): Promise<void> {
+  const cplus = CID.parse('bafyreie74jtf23zzz2tdgsz7axfrm4pidje43ypqn25v4gkdtfjbcj62km') // cesium plus import
+  const amt = CID.parse('bafyreicvlp2p65agkxpzcboedba7zit55us4zvtyyq2wesvsdedy6irwfy') // empty root cid
+
+  const rootNode: IndexInode = (await kubo.dag.get(amt)).value
+
+  const all = sortCids(await allCplusCids(cplus))
+  console.log(Date.now() + ' converting to virtual tree ')
+  const inode = arrayToVinode(all)
+  console.log(Date.now() + ' merging')
+  const cid = await mergeInodesSync(rootNode, inode)
+  console.log(Date.now() + ' finished')
+  console.log(cid)
+}
+
+// report failures instead of leaving an unhandled rejection
+doMergeAMT().catch((e: unknown) => {
+  console.error(e)
+  process.exitCode = 1
+})
diff --git a/src/types.ts b/src/types.ts
index 43e2e2980fdbafcb3540beb6bf910e20bd89d47a..bf1d2f5339d4dbeeaffb1912f2eb81cdafec8d97 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -26,8 +26,6 @@ export interface IndexInode {
 // same as IndexInode but mutable and only in memory
 // allows to process batch insert without having to change all intermediate nodes (including root node) each time
 export interface IndexVinode {
-  // can be a total new index inode (null) or be already aware of its old CID
-  cid: null | CID
   // same as IndexInode
   children: (null | [string, IndexVinode | IndexLeaf])[]
 }