From 87484913bf7e3d33ac9f2481069fdb302e42006a Mon Sep 17 00:00:00 2001
From: Hugo Trentesaux <hugo@trentesaux.fr>
Date: Tue, 19 Nov 2024 15:52:42 +0100
Subject: [PATCH] wip complete cplus import script

---
 README.md                         |  2 +-
 src/scripts/cesium-plus-import.ts | 57 ++++++++++++++++++++-----------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 6c28b34..36afca4 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ pnpm dev
 # run datapod indexer
 pnpm start
 # run given script
-pnpm exec tsx ./src/script/hello.ts
+pnpm exec tsx ./src/scripts/cesium-plus-import.ts
 ```
 
 More detail in the doc below.
diff --git a/src/scripts/cesium-plus-import.ts b/src/scripts/cesium-plus-import.ts
index b2c3c5c..8398882 100644
--- a/src/scripts/cesium-plus-import.ts
+++ b/src/scripts/cesium-plus-import.ts
@@ -14,10 +14,10 @@ async function fetchRawCplus(id: string): Promise<string> {
 }
 
 /// download all c+ data and add them to a file
-async function downloadAllCplusProfilesRaw(filename: string) {
+async function downloadAllCplusProfilesRaw(endpoint: string, filename: string) {
   const SCROLL_TIME = '5m'
   const PAGE_SIZE = 100
-  const ENDPOINT = 'https://g1.data.e-is.pro'
+  const ENDPOINT = endpoint
   const URL = `${ENDPOINT}/user/profile/_search?scroll=${SCROLL_TIME}&size=${PAGE_SIZE}`
   const NOTIF = 1000
 
@@ -58,11 +58,9 @@ async function downloadAllCplusProfilesRaw(filename: string) {
 }
 
 /// put all raw cplus profiles to ipfs node in index request and write result to a file
-async function wrapRawProfilesInIndexRequest() {
+async function wrapRawProfilesInIndexRequest(input: string, output: string) {
   const LIMIT = 500 // max number of lines to process simultaneously
   const NOTIF = 2000 // log every N lines processed
-  const input = './input/cplusimport.jsonl'
-  const output = './input/cplusIR.txt'
   const rejected = './input/cplusHS.txt'
   const convertImg = false // also upload base64 image as separate file for later reference
   let queueSize = 0
@@ -120,21 +118,40 @@ async function importIrToAMT(rootNodeCid: CID, input: string) {
   await indexRequestsToAMT(requests, rootNodeCid)
 }
 
-// 26 minutes
-// this can take a while because ~50000 profiles are downloaded in raw format independantly
-// downloadAllCplusProfilesRaw('./input/cplusimport.jsonl')
+async function main() {
+  console.log(Date.now(), 'start downloading cplus profiles')
+
+  // 26 minutes
+  // this can take a while because ~50000 profiles are downloaded in raw format independently
+  // 'https://g1.data.e-is.pro'
+  await downloadAllCplusProfilesRaw('https://g1data.dns1.us', './input/cplusimport.jsonl')
+
+  console.log(Date.now(), 'start wraping in index requests')
+
+  // 12 minutes
+  // speed is reduced to limit RAM usage and concurrent writes to IPFS node
+  await wrapRawProfilesInIndexRequest('./input/cplusimport.jsonl', './input/cplusIR.txt')
+
+  console.log(Date.now(), 'start adding to current root cid')
 
-// 12 minutes
-// speed is reduced to limit RAM usage and concurrent writes to IPFS node
-// wrapRawProfilesInIndexRequest()
+  // 3 minutes
+  // imports in batches and logs successive CIDs
 
-// 3 minutes
-// import by batch and logs successive cids
-// importIrToAMT(EMPTY_NODE_CID, './input/cplusIR.txt')
-const rootCID = CID.parse("bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze")
-importIrToAMT(rootCID, './input/devIr+labels.txt')
-// → bafyreih4jspnqnsd4o3sdqv7c765uyylhtlh5majjw6aq6clilkq7tmqey (old with simple nodes)
-// → bafyreicklp6mtqzubxxti2uddggmsttjbdcuahm2uqxuri4z6duypliax4 (new with more context and fixed labels)
+  // import to an empty node
+  // importIrToAMT(EMPTY_NODE_CID, './input/cplusIR.txt')
+
+  // import dev index requests on an existing root
+  // const rootCID = CID.parse('bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze')
+  // importIrToAMT(rootCID, './input/devIr+labels.txt')
+
+  // import all cplus data on an existing root
+  // root CID we want to import to (example: current root CID of datapod → "tamt")
+  const rootCID = CID.parse('bafyreih5fz46ezyf25jns6azxxjrjr625f2hvkt24zj4w7wm3dfedfwgv4')
+  await importIrToAMT(rootCID, './input/cplusIR.txt')
+  // the last CID logged is the new root CID to start the indexer on
+  // pnpm start /ipfs/<root cid>
+
+  console.log(Date.now(), 'finished. Now start your indexer.')
+}
 
-// bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze
-// bafyreifhhss6h5j72ewdcr6b75wda4573wtskjfp2pqiae5l73efwvrvjy
\ No newline at end of file
+main()
-- 
GitLab