From 858cdf8afec2dbdc240ac4934b6c0d4a61611b2c Mon Sep 17 00:00:00 2001
From: Hugo Trentesaux <hugo@trentesaux.fr>
Date: Fri, 5 Apr 2024 16:40:45 +0200
Subject: [PATCH] prepare merge algo for C+ import

---
 src/processor.ts | 145 +++++++++++++++++++++++++++++++++++++++++------
 src/types.ts     |  10 ++++
 2 files changed, 139 insertions(+), 16 deletions(-)

diff --git a/src/processor.ts b/src/processor.ts
index 5eee07a..8f6596e 100644
--- a/src/processor.ts
+++ b/src/processor.ts
@@ -1,7 +1,20 @@
 import { CID } from 'multiformats'
 import { kubo } from './kubo'
 import { IPNS, BASE, IPNS_HIST, KEYSIZE } from './consts'
-import { type IndexInode, emptyInode, type IndexLeaf, type IndexRequest, emptyLeaf, type IndexHist } from './types'
+import {
+  type IndexInode,
+  emptyInode,
+  type IndexLeaf,
+  emptyLeaf,
+  type IndexVinode,
+  type IndexRequest,
+  type IndexHist
+} from './types'
+
+// convert timestamp to key: its representation in base BASE, left-padded with '0' up to KEYSIZE characters
+function timestampToKey(timestamp: number): string {
+  return timestamp.toString(BASE).padStart(KEYSIZE, '0')
+}
 
 // add cid to index queue
 export async function addToIndexQueue(cid: CID, indexRequest: IndexRequest) {
@@ -23,21 +36,38 @@ export async function addToIndexQueue(cid: CID, indexRequest: IndexRequest) {
   }
 }
 
+// build a process function for processInode that simply inserts the request in a leaf
+function insertRequest(indexRequestCid: CID): ProcessFunction {
+  return (maybeLeaf) => {
+    const targetLeaf =
+      maybeLeaf == null
+        ? // nothing at this position yet: start from a new leaf
+          emptyLeaf()
+        : // insert indexRequestCid in the existing leaf
+          maybeLeaf
+    return processLeaf(targetLeaf, indexRequestCid)
+  }
+}
+
 // simplest way to insert index request given its CID
 // rootCid: root node of the AMT to add the index request in
 // indexRequestCid: index request to add
 // returns the new root cid
 export async function insertIndexRequest(rootCid: CID, indexRequestCid: CID): Promise<CID> {
   const [rootDag, indexDag] = await Promise.all([kubo.dag.get(rootCid), kubo.dag.get(indexRequestCid)])
-  const key = indexDag.value.timestamp.toString(BASE).padStart(KEYSIZE, '0')
-  return processInode(rootDag.value, key, indexRequestCid)
+  const key = timestampToKey(indexDag.value.timestamp)
+  return processInode(rootDag.value, key, insertRequest(indexRequestCid))
 }
 
 // same as above but with known request
-export async function insertKnownIndexRequest(rootCid: CID, indexRequestCid: CID, indexRequest: IndexRequest): Promise<CID> {
-  const key = indexRequest.timestamp.toString(BASE).padStart(KEYSIZE, '0')
+export async function insertKnownIndexRequest(
+  rootCid: CID,
+  indexRequestCid: CID,
+  indexRequest: IndexRequest
+): Promise<CID> {
+  const key = timestampToKey(indexRequest.timestamp)
   const rootDag = await kubo.dag.get(rootCid)
-  return processInode(rootDag.value, key, indexRequestCid)
+  return processInode(rootDag.value, key, insertRequest(indexRequestCid))
 }
 
 async function resolveHist(): Promise<CID> {
@@ -64,14 +94,27 @@ function publishHistory(cid: CID) {
   })
 }
 
-async function processInode(node: IndexInode, key: string, val: CID): Promise<CID> {
+// function used to process the leaf found at a key (null when there is none yet); resolves to the CID to store there
+export interface ProcessFunction {
+  (maybeLeaf: null | IndexLeaf): Promise<CID>
+}
+
+// return bucket index for a key: its first character parsed as a digit in base BASE
+function bucket(letter: string): number {
+  return parseInt(letter[0], BASE)
+}
+
+/// process internal node
+/// insert the CID returned by `func` at the position given by `key`
+/// returns the CID of the resulting modified parent node
+export async function processInode(node: IndexInode, key: string, func: ProcessFunction): Promise<CID> {
   // console.log("key: " + key)
 
   // bucket
-  const b = parseInt(key[0], BASE)
+  const b = bucket(key)
   // if bucket is available, place leaf in it
   if (node.children[b] === null) {
-    node.children[b] = [key, await processLeaf(emptyLeaf(), val)]
+    node.children[b] = [key, await func(null)]
   } else {
     // must share bucket with a node
     const [k1, cid1] = node.children[b] as [string, CID]
@@ -89,13 +132,13 @@ async function processInode(node: IndexInode, key: string, val: CID): Promise<CI
           // we can not enter a leaf at this stage
           throw Error('should not enter a leaf, this should have been an end')
         }
-        node.children[b] = [e.common, await processInode(enterNodeAsInode, e.nk, val)]
+        node.children[b] = [e.common, await processInode(enterNodeAsInode, e.nk, func)]
         break
 
       case resultType.End:
         console.log('end')
         const otherLeaf = (await kubo.dag.get(cid1)).value as IndexLeaf
-        node.children[b] = [k1, await processLeaf(otherLeaf, val)]
+        node.children[b] = [k1, await func(otherLeaf)]
         break
 
       case resultType.Diff:
@@ -103,7 +146,7 @@ async function processInode(node: IndexInode, key: string, val: CID): Promise<CI
         console.log('diff on "' + c.common + '" keys "' + c.nk1 + '" / "' + c.nk2 + '"')
         const newNode = emptyInode()
         newNode.children[c.b1] = [c.nk1, cid1]
-        newNode.children[c.b2] = [c.nk2, await processLeaf(emptyLeaf(), val)]
+        newNode.children[c.b2] = [c.nk2, await func(null)]
         const newNodeCid = (await kubo.dag.put(newNode)) as CID
         node.children[b] = [c.common, newNodeCid]
         break
@@ -115,7 +158,9 @@ async function processInode(node: IndexInode, key: string, val: CID): Promise<CI
   return newCid
 }
 
-async function processLeaf(node: IndexLeaf, val: CID): Promise<CID> {
+/// process leaf in the tree
+/// children have to be unique and ordered
+export async function processLeaf(node: IndexLeaf, val: CID): Promise<CID> {
   if (!node.leaf.some((c) => c.toString() == val.toString())) {
     // only insert if not already there (avoid duplicate)
     node.leaf.push(val)
@@ -126,6 +171,74 @@ async function processLeaf(node: IndexLeaf, val: CID): Promise<CID> {
   return newCid
 }
 
+/// merge virtual node nodeB into nodeA, one bucket after the other; nodeA is modified in place
+/// returns the CID from kubo.dag.put of the merged node (merged leaves are deduplicated and sorted)
+export async function mergeInodesSync(nodeA: IndexInode | IndexLeaf, nodeB: IndexVinode | IndexLeaf): Promise<CID> {
+  if ((nodeB as IndexLeaf).leaf) {
+    // leaves: dedupe CIDs by string form, since a Set of CID objects only compares references
+    const allCids = [...(nodeA as unknown as IndexLeaf).leaf, ...(nodeB as IndexLeaf).leaf]
+    const cidByKey = new Map(allCids.map((c): [string, CID] => [c.toString(), c]))
+    const newLeaf: IndexLeaf = {
+      leaf: Array.from(cidByKey.keys()).sort().map((k) => cidByKey.get(k) as CID)
+    }
+    return kubo.dag.put(newLeaf) as Promise<CID>
+  }
+  // if we are not yet at the leaf level, we can safely assume that nodeA and nodeB are indeed inodes
+  const noda = nodeA as IndexInode
+  const nodb = nodeB as IndexVinode
+
+  // process each bucket sequentially
+  for (let b = 0; b < BASE; b++) {
+    const nAcb = noda.children[b]
+    const nBcb = nodb.children[b]
+    if (nAcb == null && nBcb != null) {
+      // bucket only used in nodeB: concretize its child and attach it to nodeA
+      const [kB, childB] = nBcb
+      noda.children[b] = [kB, await concretize(childB)]
+    } else if (nAcb != null && nBcb != null) {
+      // both are non null
+      const [kA, childA] = nAcb
+      const [kB, childB] = nBcb
+      if (kA === kB) {
+        const childAnode = (await kubo.dag.get(childA)).value
+        noda.children[b] = [kA, await mergeInodesSync(childAnode, childB)]
+      } else {
+        // both keys must have same size since we can only merge nodes from same depth
+        // then because they are different, the only result type is diffResult
+        const c = compareKey(kA, kB) as diffResult
+        const newNode = emptyInode()
+        newNode.children[c.b1] = [c.nk1, childA]
+        newNode.children[c.b2] = [c.nk2, await concretize(childB)]
+        const newNodeCid = (await kubo.dag.put(newNode)) as CID
+        noda.children[b] = [c.common, newNodeCid]
+      }
+    } else {
+      // nodeB has nothing in this bucket: keep nodeA's entry untouched
+    }
+  }
+  // now that we have the new node, we can upload it and return its cid
+  return kubo.dag.put(noda) as Promise<CID>
+}
+
+/// concretize virtual node: put it (children first, recursively) through kubo.dag.put and return its CID
+async function concretize(node: IndexVinode | IndexLeaf): Promise<CID> {
+  // a leaf has no virtual children: put it as is
+  if ((node as unknown as IndexLeaf).leaf) {
+    return kubo.dag.put(node) as Promise<CID>
+  }
+  // otherwise this is a virtual inode: concretize each non-null child, keeping its key
+  const vinode = node as unknown as IndexVinode
+  const pending = vinode.children.map(async (child): Promise<null | [string, CID]> => {
+    if (child == null) {
+      return null
+    }
+    const [key, sub] = child
+    return [key, await concretize(sub)]
+  })
+  const newNode: IndexInode = { children: await Promise.all(pending) }
+  return kubo.dag.put(newNode) as Promise<CID>
+}
+
 export interface diffResult {
   type: resultType
   common: string
@@ -168,15 +281,15 @@ export function compareKey(k1: string, k2: string): compResult {
       return {
         type: resultType.Diff,
         common,
-        b1: parseInt(c1, BASE),
+        b1: bucket(c1),
         nk1: k1.slice(common.length),
-        b2: parseInt(c2, BASE),
+        b2: bucket(c2),
         nk2: k2.slice(common.length)
       }
     }
   }
   if (k1.length < k2.length) {
-    const b = parseInt(k2[l], BASE)
+    const b = bucket(k2[l])
     // we can enter the bucket
     return {
       type: resultType.Enter,
diff --git a/src/types.ts b/src/types.ts
index 0cfd81b..43e2e29 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -22,6 +22,16 @@ export interface IndexInode {
   // but in real world, we will use a bitvector, likely in base16, groups of 4 bits
 }
 
+// virtual internal node
+// same as IndexInode but mutable, only in memory, and holding its child nodes directly instead of their CIDs
+// allows processing batch inserts without having to change all intermediate nodes (including root node) each time
+export interface IndexVinode {
+  // null for a totally new index inode, otherwise the old CID this node is already aware of
+  cid: null | CID
+  // same layout as IndexInode children, but each entry holds the child node itself (IndexVinode or IndexLeaf)
+  children: (null | [string, IndexVinode | IndexLeaf])[]
+}
+
 export function emptyInode(): IndexInode {
   return {
     children: new Array(BASE).fill(null)
-- 
GitLab