diff --git a/package.json b/package.json index 3ec166b9d544934e29719e5455eeac1465ebada7..5b38029bc4793ba096108e7bcc9c582f336ff1b2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "ddd-indexer", - "version": "0.0.1", + "version": "0.0.2", "private": true, "type": "module", "scripts": { diff --git a/scripts/docker-build.sh b/scripts/docker-build.sh index 2940396bb9d3022644badc01033defc54ecda9fd..cc5ca3609cf4de749d752aadae10392850d9dee7 100755 --- a/scripts/docker-build.sh +++ b/scripts/docker-build.sh @@ -15,16 +15,16 @@ docker image tag duniter-datapod h30x/duniter-datapod:latest docker image push h30x/duniter-datapod:$version_tag docker image push h30x/duniter-datapod:latest -# --- kubo -docker buildx build -f Dockerfile.Kubo -t datapod-kubo . +# # --- kubo +# docker buildx build -f Dockerfile.Kubo -t datapod-kubo . -# Tag with version and 'latest' -docker image tag datapod-kubo h30x/datapod-kubo:$version_tag -docker image tag datapod-kubo h30x/datapod-kubo:latest +# # Tag with version and 'latest' +# docker image tag datapod-kubo h30x/datapod-kubo:$version_tag +# docker image tag datapod-kubo h30x/datapod-kubo:latest -# Push both -docker image push h30x/datapod-kubo:$version_tag -docker image push h30x/datapod-kubo:latest +# # Push both +# docker image push h30x/datapod-kubo:$version_tag +# docker image push h30x/datapod-kubo:latest # --- hasura docker buildx build -f Dockerfile.Hasura -t datapod-hasura . diff --git a/src/indexer/handlers.ts b/src/indexer/handlers.ts index 58ce1e15b85de9c8d2821519125411d004a38d1a..05e0933ad9bf3ab2e5f2856129b06a621aab54e9 100644 --- a/src/indexer/handlers.ts +++ b/src/indexer/handlers.ts @@ -173,30 +173,35 @@ export async function takeFromDiffQueue(): Promise<void> { // compute diff with remote tree and add index requests to merge queue // TODO prevent computing diff from several peers at the same time, otherwise CIDs will be added multiple times export async function computeDiff(fromCID: CID, toCID: CID): Promise<void> { - // if they differ - if (fromCID.toString() != toCID.toString()) { - console.log(`👐 computing diff from ${fromCID} to ${toCID}`) - // iterate over all index requests of diff - const iterator = getDiff(fromCID, toCID) - let num = 0 - for await (const leaf of iterator) { - for (let irCID of leaf) { - // add it to the process queue to be added in the tree - GLOB_mergeQueue.push(['key', irCID]) // FIXME get key while iterating - num += 1 - } - // make sure that collection is triggered regularly - if (num > BATCH_SIZE) { - num = 0 - events.emit(evtype.triggerCollect) - // This is a hack to limit injestion of new data and let time to process all - // for 100 000 documents with a batch size of 1000 and 3 seconds, it is adding 5 minutes overall - await setTimeout(3000) // 3 sec + try { + // if they differ + if (fromCID.toString() != toCID.toString()) { + console.log(`👐 computing diff from ${fromCID} to ${toCID}`) + // iterate over all index requests of diff + const iterator = getDiff(fromCID, toCID) + let num = 0 + for await (const leaf of iterator) { + for (let irCID of leaf) { + // add it to the process queue to be added in the tree + GLOB_mergeQueue.push(['key', irCID]) // FIXME get key while iterating + num += 1 + } + // make sure that collection is triggered regularly + if (num > BATCH_SIZE) { + num = 0 + events.emit(evtype.triggerCollect) + // This is a hack to limit injestion of new data and let time to process all + // for 100 000 documents with a batch size of 1000 and 3 seconds, it is adding 5 minutes overall + await setTimeout(3000) // 3 sec + } } + events.emit(evtype.triggerCollect) + } else { + console.log(`👌 already at ${toCID}`) } - events.emit(evtype.triggerCollect) - } else { - console.log(`👌 already at ${toCID}`) + } catch (e) { + // compare code does some assertions about the tree format + console.log('could not compute diff from ' + fromCID + ' to ' + toCID + ' due to error: ', e) } } diff --git a/src/interface.ts b/src/interface.ts index a582fccc015614ff3a364911552fd349aa71aa15..cac7bdf433de9ef83f463dcbbdf6f2d6a980a626 100644 --- a/src/interface.ts +++ b/src/interface.ts @@ -70,7 +70,7 @@ async function* getDiffInodes(inode1: IndexInode, inode2: IndexInode): AsyncIter } if (ri == null) { // left is not null and was added, ignore - console.log('ignoring removed data ' + li) + console.log(`ignoring removed data at ${ctx}${li[0]}: ${li[1]}`) continue } @@ -121,106 +121,106 @@ async function* getDiffInodes(inode1: IndexInode, inode2: IndexInode): AsyncIter } } -// recursive comparison of inodes -// the differences are added asynchronously in "addedLeft" and "addedRight" arrays -// meaning that these arrays will still grow after the function returns -export async function compareInodes( - // key of compared inodes, used as termination condition to know if we reached the leaf - k: string, - // reference node for comparison - left: IndexInode, - // node to compare with reference - right: IndexInode, - // list of nodes in left not present in right, (key,value) pairs - addedLeft: Array<[string, CID]>, - // list of nodes in right not present in left, (key,value) pairs - addedRight: Array<[string, CID]> -) { - // termination condition, since we know the size of the key - if (k.length == KEYSIZE) { - console.log('comparing leaf ' + k) - compareLeafs(k, left as unknown as IndexLeaf, right as unknown as IndexLeaf, addedLeft, addedRight) - return - } - // console.log('comparing node ' + k) +// // recursive comparison of inodes +// // the differences are added asynchronously in "addedLeft" and "addedRight" arrays +// // meaning that these arrays will still grow after the function returns +// export async function compareInodes( +// // key of compared inodes, used as termination condition to know if we reached the leaf +// k: string, +// // reference node for comparison +// left: IndexInode, +// // node to compare with reference +// right: IndexInode, +// // list of nodes in left not present in right, (key,value) pairs +// addedLeft: Array<[string, CID]>, +// // list of nodes in right not present in left, (key,value) pairs +// addedRight: Array<[string, CID]> +// ) { +// // termination condition, since we know the size of the key +// if (k.length == KEYSIZE) { +// console.log('comparing leaf ' + k) +// compareLeafs(k, left as unknown as IndexLeaf, right as unknown as IndexLeaf, addedLeft, addedRight) +// return +// } +// // console.log('comparing node ' + k) - // iterate over nodes children - for (let i = 0; i < BASE; i++) { - // get left and right entries - const li = left.children[i] - const ri = right.children[i] - if (li == null && ri == null) { - // do not compare if they are both null - continue - } - if (li == null) { - // right is not null and was added - const nk = k + ri![0] - console.log('added right ' + nk) - addedRight.push([k, ri![1]]) - continue - } - if (ri == null) { - // left is not null and was added - console.log('added left') - const nk = k + li![0] - addedLeft.push([nk, li![1]]) - continue - } +// // iterate over nodes children +// for (let i = 0; i < BASE; i++) { +// // get left and right entries +// const li = left.children[i] +// const ri = right.children[i] +// if (li == null && ri == null) { +// // do not compare if they are both null +// continue +// } +// if (li == null) { +// // right is not null and was added +// const nk = k + ri![0] +// console.log('added right ' + nk) +// addedRight.push([k, ri![1]]) +// continue +// } +// if (ri == null) { +// // left is not null and was added +// console.log('added left') +// const nk = k + li![0] +// addedLeft.push([nk, li![1]]) +// continue +// } - // both buckets have items, unstructure them to get key and value - const [lik, lic] = li - const [rik, ric] = ri - if (lic.toString() == ric.toString()) { - // do not compare if the cid is the same - console.log('same ' + k + lik) - continue - } +// // both buckets have items, unstructure them to get key and value +// const [lik, lic] = li +// const [rik, ric] = ri +// if (lic.toString() == ric.toString()) { +// // do not compare if the cid is the same +// console.log('same ' + k + lik) +// continue +// } - if (lik == rik) { - // keys are the same and only content changed, dig deeper in both - const nk = k + lik - Promise.all([kubo.dag.get(lic), kubo.dag.get(ric)]).then((r) => { - const [lin, rin] = r - compareInodes(nk, lin.value, rin.value, addedLeft, addedRight) - }) - continue - } +// if (lik == rik) { +// // keys are the same and only content changed, dig deeper in both +// const nk = k + lik +// Promise.all([kubo.dag.get(lic), kubo.dag.get(ric)]).then((r) => { +// const [lin, rin] = r +// compareInodes(nk, lin.value, rin.value, addedLeft, addedRight) +// }) +// continue +// } - // there is a key diff, we have to compare - if (lik.length > rik.length && lik.startsWith(rik)) { - const nk = k + rik - console.log('diff ' + nk) - // intermediate inode might have been added to the right - // create virtual node then dig deeper in right - const lvnode = emptyInode(nk) - const b = parseInt(lik[rik.length], BASE) - lvnode.children[b] = [lik.slice(rik.length), lic] - kubo.dag.get(ric).then((r) => { - compareInodes(nk, lvnode, r.value, addedLeft, addedRight) - }) - continue - } - if (lik.length < rik.length && rik.startsWith(lik)) { - const nk = k + lik - console.log('diff ' + nk) - // intermediate inode might have been added to the left - // create virtual node then dig deeper in left - const rvnode = emptyInode(nk) - const b = parseInt(rik[lik.length], BASE) - rvnode.children[b] = [rik, ric] - kubo.dag.get(ric).then((l) => { - compareInodes(nk, l.value, rvnode, addedLeft, addedRight) - }) - continue - } - // keys do not cover the same time period - // content is then completely different - addedLeft.push([k + lik, lic]) - addedRight.push([k + rik, ric]) - continue - } -} +// // there is a key diff, we have to compare +// if (lik.length > rik.length && lik.startsWith(rik)) { +// const nk = k + rik +// console.log('diff ' + nk) +// // intermediate inode might have been added to the right +// // create virtual node then dig deeper in right +// const lvnode = emptyInode(nk) +// const b = parseInt(lik[rik.length], BASE) +// lvnode.children[b] = [lik.slice(rik.length), lic] +// kubo.dag.get(ric).then((r) => { +// compareInodes(nk, lvnode, r.value, addedLeft, addedRight) +// }) +// continue +// } +// if (lik.length < rik.length && rik.startsWith(lik)) { +// const nk = k + lik +// console.log('diff ' + nk) +// // intermediate inode might have been added to the left +// // create virtual node then dig deeper in left +// const rvnode = emptyInode(nk) +// const b = parseInt(rik[lik.length], BASE) +// rvnode.children[b] = [rik, ric] +// kubo.dag.get(ric).then((l) => { +// compareInodes(nk, l.value, rvnode, addedLeft, addedRight) +// }) +// continue +// } +// // keys do not cover the same time period +// // content is then completely different +// addedLeft.push([k + lik, lic]) +// addedRight.push([k + rik, ric]) +// continue +// } +// } // compare leaves by "eating" cids one by one // based on the assumption that leaves are sorted diff --git a/src/processor.ts b/src/processor.ts index 86ac55cc5f588501560c092701c4eacc4e28eb7b..773eb53f951fb1e33493dab89d3b1a70bdac8ead 100644 --- a/src/processor.ts +++ b/src/processor.ts @@ -1,9 +1,9 @@ import { CID } from 'multiformats' import { kubo } from './kubo' import { BASE, KEYSIZE } from './consts' -import { emptyInode, emptyVinode, emptyLeaf } from './types' -import type { IndexInode, IndexLeaf, IndexVinode, IndexRequest, IndexHist } from './types' -import { DD_TAMT_HIST_OPT, DD_TAMT_OPT, ddKeys } from './indexer/ipns' +import { emptyInode, emptyVinode } from './types' +import type { IndexInode, IndexLeaf, IndexVinode, IndexHist } from './types' +import { DD_TAMT_HIST_OPT, ddKeys } from './indexer/ipns' import { uniqueby } from './utils' import assert from 'assert' @@ -248,8 +248,7 @@ export async function mergeInodesSync( throw Error('should not be possible, are keys same size?') } // if we are not yet at the leaf level, we can safely assume that nodeA and nodeB are indeed inodes - const noda = nodeA as IndexInode - const nodb = nodeB as IndexVinode + const [noda, nodb] = [nodeA as IndexInode, nodeB as IndexVinode] assert(noda.ctx == nodb.ctx) const ctx = noda.ctx @@ -320,6 +319,7 @@ export async function mergeInodesSync( } else { // nBcb is null, keep nAcb bucket untouched (might be null or not) if (nAcb == null) continue + // if nAcb is non null, we have to get its item count const [_kA, childA] = nAcb itemCount += await getItemCount(childA) // suboptimal too continue @@ -327,6 +327,7 @@ export async function mergeInodesSync( } // now that we have the new node, we can upload it and return its cid noda.count = itemCount + // console.debug(itemCount) const nodaCID = kubo.dag.put(noda).then((cid) => { // WIP pinning does not work well // kubo.pin.add(cid).catch((e) => console.log(`📌📌 could not pin newly merged node ${cid}`)) @@ -399,6 +400,7 @@ export async function concretizeCid(node: IndexVinode | IndexLeaf): Promise<[num return Promise.all([nodeL.leaf.length, kubo.dag.put(node) as Promise<CID>]) } const newNode = await concretize(node as IndexVinode) + // console.debug(newNode.count) return Promise.all([newNode.count, kubo.dag.put(newNode) as Promise<CID>]) } @@ -415,11 +417,15 @@ async function concretize(node: IndexVinode): Promise<IndexInode> { return [k, cid as CID] }) }) - return { - children: await Promise.all(childrenPromise), + // note: we must await before referencing the variable itemCount in return value + const awaitedChildrens = await Promise.all(childrenPromise) + // console.debug(itemCount) + const concretizedInode: IndexInode = { + children: awaitedChildrens, count: itemCount, ctx: node.ctx } + return concretizedInode } /// get item count contained in a given CID @@ -431,6 +437,7 @@ async function getItemCount(cid: CID): Promise<number> { return nodeL.leaf.length } if (nodeN.children) { + // console.debug(nodeN.count) return nodeN.count } throw Error('can not get item count of this object: ' + cid) diff --git a/src/scripts/diff.ts b/src/scripts/diff.ts index ebdfe54ee18111d75d3d68af69095b1bc2797485..0faeae040afa50639dfe51fb94a8bbc35d3feb5f 100644 --- a/src/scripts/diff.ts +++ b/src/scripts/diff.ts @@ -5,9 +5,11 @@ import { setTimeout } from 'timers/promises' console.log('start') -const fromCID = EMPTY_NODE_CID +// const fromCID = EMPTY_NODE_CID // const fromCID = CID.parse('bafyreic6qy2k5w6324uayfzoybmzypdv57zk3zxezaiws4h553jjogw6o4') -const toCID = CID.parse('bafyreihls2kmwx2ufuwx4kbl67f3ipl5wbc6j6snfegy3sttymrhxsgvpa') +// const toCID = CID.parse('bafyreihls2kmwx2ufuwx4kbl67f3ipl5wbc6j6snfegy3sttymrhxsgvpa') +const fromCID = CID.parse('bafyreiaixvejxrcszzexohdo5obtduw5mctvti2jsgob4pnq7ovd5ngrxi') +const toCID = CID.parse('bafyreiel7fh42ehswlh7wg4mz5zzrugwbpuevzojxuzw3dwgyoiyhvjhma') const iterator = getDiff(fromCID, toCID)