From 858cdf8afec2dbdc240ac4934b6c0d4a61611b2c Mon Sep 17 00:00:00 2001 From: Hugo Trentesaux <hugo@trentesaux.fr> Date: Fri, 5 Apr 2024 16:40:45 +0200 Subject: [PATCH] prepare merge algo for C+ import --- src/processor.ts | 145 +++++++++++++++++++++++++++++++++++++++++------ src/types.ts | 10 ++++ 2 files changed, 139 insertions(+), 16 deletions(-) diff --git a/src/processor.ts b/src/processor.ts index 5eee07a..8f6596e 100644 --- a/src/processor.ts +++ b/src/processor.ts @@ -1,7 +1,20 @@ import { CID } from 'multiformats' import { kubo } from './kubo' import { IPNS, BASE, IPNS_HIST, KEYSIZE } from './consts' -import { type IndexInode, emptyInode, type IndexLeaf, type IndexRequest, emptyLeaf, type IndexHist } from './types' +import { + type IndexInode, + emptyInode, + type IndexLeaf, + emptyLeaf, + type IndexVinode, + type IndexRequest, + type IndexHist +} from './types' + +// convert timestamp to key +function timestampToKey(timestamp: number): string { + return timestamp.toString(BASE).padStart(KEYSIZE, '0') +} // add cid to index queue export async function addToIndexQueue(cid: CID, indexRequest: IndexRequest) { @@ -23,21 +36,38 @@ export async function addToIndexQueue(cid: CID, indexRequest: IndexRequest) { } } +// returns a process function suitable for processInode that simply inserts request in a leaf +function insertRequest(indexRequestCid: CID): ProcessFunction { + return (maybeLeaf) => { + if (maybeLeaf == null) { + // in this case we want to create a new leaf + return processLeaf(emptyLeaf(), indexRequestCid) + } else { + // in this case we want to insert indexRequestCid in existing leaf + return processLeaf(maybeLeaf, indexRequestCid) + } + } +} + // simplest way to insert index request given its CID // rootCid: root node of the AMT to add the index request in // indexRequestCid: index request to add // returns the new root cid export async function insertIndexRequest(rootCid: CID, indexRequestCid: CID): Promise<CID> { const [rootDag, indexDag] = await
Promise.all([kubo.dag.get(rootCid), kubo.dag.get(indexRequestCid)]) - const key = indexDag.value.timestamp.toString(BASE).padStart(KEYSIZE, '0') - return processInode(rootDag.value, key, indexRequestCid) + const key = timestampToKey(indexDag.value.timestamp) + return processInode(rootDag.value, key, insertRequest(indexRequestCid)) } // same as above but with known request -export async function insertKnownIndexRequest(rootCid: CID, indexRequestCid: CID, indexRequest: IndexRequest): Promise<CID> { - const key = indexRequest.timestamp.toString(BASE).padStart(KEYSIZE, '0') +export async function insertKnownIndexRequest( + rootCid: CID, + indexRequestCid: CID, + indexRequest: IndexRequest +): Promise<CID> { + const key = timestampToKey(indexRequest.timestamp) const rootDag = await kubo.dag.get(rootCid) - return processInode(rootDag.value, key, indexRequestCid) + return processInode(rootDag.value, key, insertRequest(indexRequestCid)) } async function resolveHist(): Promise<CID> { @@ -64,14 +94,27 @@ function publishHistory(cid: CID) { }) } -async function processInode(node: IndexInode, key: string, val: CID): Promise<CID> { +// function used to process node +export interface ProcessFunction { + (maybeLeaf: null | IndexLeaf): Promise<CID> +} + +// return bucket corresponding to given letter +function bucket(letter: string): number { + return parseInt(letter[0], BASE) +} + +/// process internal node +/// insert the CID returned by `func` at the position given by `key` +/// returns the CID of the resulting modified parent node +export async function processInode(node: IndexInode, key: string, func: ProcessFunction): Promise<CID> { // console.log("key: " + key) // bucket - const b = parseInt(key[0], BASE) + const b = bucket(key) // if bucket is available, place leaf in it if (node.children[b] === null) { - node.children[b] = [key, await processLeaf(emptyLeaf(), val)] + node.children[b] = [key, await func(null)] } else { // must share bucket with a node const [k1, cid1] =
node.children[b] as [string, CID] @@ -89,13 +132,13 @@ async function processInode(node: IndexInode, key: string, val: CID): Promise<CI // we can not enter a leaf at this stage throw Error('should not enter a leaf, this should have been an end') } - node.children[b] = [e.common, await processInode(enterNodeAsInode, e.nk, val)] + node.children[b] = [e.common, await processInode(enterNodeAsInode, e.nk, func)] break case resultType.End: console.log('end') const otherLeaf = (await kubo.dag.get(cid1)).value as IndexLeaf - node.children[b] = [k1, await processLeaf(otherLeaf, val)] + node.children[b] = [k1, await func(otherLeaf)] break case resultType.Diff: @@ -103,7 +146,7 @@ async function processInode(node: IndexInode, key: string, val: CID): Promise<CI console.log('diff on "' + c.common + '" keys "' + c.nk1 + '" / "' + c.nk2 + '"') const newNode = emptyInode() newNode.children[c.b1] = [c.nk1, cid1] - newNode.children[c.b2] = [c.nk2, await processLeaf(emptyLeaf(), val)] + newNode.children[c.b2] = [c.nk2, await func(null)] const newNodeCid = (await kubo.dag.put(newNode)) as CID node.children[b] = [c.common, newNodeCid] break @@ -115,7 +158,9 @@ async function processInode(node: IndexInode, key: string, val: CID): Promise<CI return newCid } -async function processLeaf(node: IndexLeaf, val: CID): Promise<CID> { +/// process leaf in the tree +/// children have to be unique and ordered +export async function processLeaf(node: IndexLeaf, val: CID): Promise<CID> { if (!node.leaf.some((c) => c.toString() == val.toString())) { // only insert if not already there (avoid duplicate) node.leaf.push(val) @@ -126,6 +171,74 @@ async function processLeaf(node: IndexLeaf, val: CID): Promise<CID> { return newCid } +/// merge internal nodes, synchronous implementation +// useful to merge trees +export async function mergeInodesSync(nodeA: IndexInode | IndexLeaf, nodeB: IndexVinode | IndexLeaf): Promise<CID> { + if ((nodeB as IndexLeaf).leaf) { + // these are not internal nodes, but
leaves, and we should merge them (deduplicating on the string form, like processLeaf, because a Set compares CID objects by identity and would keep equal CIDs twice) + const cidByString = new Map([...(nodeA as unknown as IndexLeaf).leaf, ...(nodeB as IndexLeaf).leaf].map((c): [string, CID] => [c.toString(), c])) + const cidList = Array.from(cidByString.values()).sort() + const newLeaf: IndexLeaf = { + leaf: cidList + } + return kubo.dag.put(newLeaf) as Promise<CID> + } + // if we are not yet at the leaf level, we can safely assume that nodeA and nodeB are indeed inodes + const noda = nodeA as IndexInode + const nodb = nodeB as IndexVinode + + // process each bucket sequentially + for (let b = 0; b < BASE; b++) { + const nAcb = noda.children[b] + const nBcb = nodb.children[b] + if (nAcb == null && nBcb != null) { + // we can concretize nodeB directly + const [kB, childB] = nBcb + noda.children[b] = [kB, await concretize(childB)] + } else if (nAcb != null && nBcb != null) { + // both are non null + const [kA, childA] = nAcb + const [kB, childB] = nBcb + if (kA == kB) { + const childAnode = (await kubo.dag.get(childA)).value + noda.children[b] = [kA, await mergeInodesSync(childAnode, childB)] + } else { + // both keys must have same size since we can only merge nodes from same depth + // then because they are different, the only result type is diffResult + const c = compareKey(kA, kB) as diffResult + const newNode = emptyInode() + newNode.children[c.b1] = [c.nk1, childA] + newNode.children[c.b2] = [c.nk2, await concretize(childB)] + const newNodeCid = (await kubo.dag.put(newNode)) as CID + noda.children[b] = [c.common, newNodeCid] + } + } else { + // keep node untouched + } + } + // now that we have the new node, we can upload it and return its cid + return kubo.dag.put(noda) as Promise<CID> +} + +/// concretize virtual node +async function concretize(node: IndexVinode | IndexLeaf): Promise<CID> { + if ((node as unknown as IndexLeaf).leaf) { + return kubo.dag.put(node) as Promise<CID> + } + // this is a virtual inode + const childrenPromise: Array<null | Promise<[string, CID]>> = (node as unknown as IndexVinode).children.map((c) => { + if (c == null) { + return
null + } + const [k, v] = c + return concretize(v).then((cid) => [k, cid as CID]) + }) + const newNode: IndexInode = { + children: await Promise.all(childrenPromise) + } + return kubo.dag.put(newNode) as Promise<CID> +} + export interface diffResult { type: resultType common: string @@ -168,15 +281,15 @@ export function compareKey(k1: string, k2: string): compResult { return { type: resultType.Diff, common, - b1: parseInt(c1, BASE), + b1: bucket(c1), nk1: k1.slice(common.length), - b2: parseInt(c2, BASE), + b2: bucket(c2), nk2: k2.slice(common.length) } } } if (k1.length < k2.length) { - const b = parseInt(k2[l], BASE) + const b = bucket(k2[l]) // we can enter the bucket return { type: resultType.Enter, diff --git a/src/types.ts b/src/types.ts index 0cfd81b..43e2e29 100644 --- a/src/types.ts +++ b/src/types.ts @@ -22,6 +22,16 @@ export interface IndexInode { // but in real world, we will use a bitvector, likely in base16, groups of 4 bits } +// virtual internal node +// same as IndexInode but mutable and only in memory +// allows processing batch inserts without having to change all intermediate nodes (including root node) each time +export interface IndexVinode { + // null for a totally new index inode, otherwise its old CID + cid: null | CID + // same buckets as IndexInode, but each child is an in-memory node (IndexVinode | IndexLeaf) instead of a CID + children: (null | [string, IndexVinode | IndexLeaf])[] +} + export function emptyInode(): IndexInode { return { children: new Array(BASE).fill(null) -- GitLab