diff --git a/README.md b/README.md index 6c28b3462f79d853552742299828826b698f859d..36afca46d961ad9883692258568f88ca6def4dc4 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ pnpm dev # run datapod indexer pnpm start # run given script -pnpm exec tsx ./src/script/hello.ts +pnpm exec tsx ./src/scripts/cesium-plus-import.ts ``` More detail in the doc below. diff --git a/src/scripts/cesium-plus-import.ts b/src/scripts/cesium-plus-import.ts index b2c3c5cf52b2619bbd34faf3607edb7ae51525d9..83988825b9aed9f2a6fe8b522b60dcfc5571b1f6 100644 --- a/src/scripts/cesium-plus-import.ts +++ b/src/scripts/cesium-plus-import.ts @@ -14,10 +14,10 @@ async function fetchRawCplus(id: string): Promise<string> { } /// download all c+ data and add them to a file -async function downloadAllCplusProfilesRaw(filename: string) { +async function downloadAllCplusProfilesRaw(endpoint: string, filename: string) { const SCROLL_TIME = '5m' const PAGE_SIZE = 100 - const ENDPOINT = 'https://g1.data.e-is.pro' + const ENDPOINT = endpoint const URL = `${ENDPOINT}/user/profile/_search?scroll=${SCROLL_TIME}&size=${PAGE_SIZE}` const NOTIF = 1000 @@ -58,11 +58,9 @@ async function downloadAllCplusProfilesRaw(filename: string) { } /// put all raw cplus profiles to ipfs node in index request and write result to a file -async function wrapRawProfilesInIndexRequest() { +async function wrapRawProfilesInIndexRequest(input: string, output: string) { const LIMIT = 500 // max number of lines to process simultaneously const NOTIF = 2000 // log every N lines processed - const input = './input/cplusimport.jsonl' - const output = './input/cplusIR.txt' const rejected = './input/cplusHS.txt' const convertImg = false // also upload base64 image as separate file for later reference let queueSize = 0 @@ -120,21 +118,40 @@ async function importIrToAMT(rootNodeCid: CID, input: string) { await indexRequestsToAMT(requests, rootNodeCid) } -// 26 minutes -// this can take a while because ~50000 profiles are downloaded in raw 
format independantly // downloadAllCplusProfilesRaw('./input/cplusimport.jsonl') +async function main() { + console.log(Date.now(), 'start downloading cplus profiles') + + // 26 minutes + // this can take a while because ~50000 profiles are downloaded in raw format independently + // 'https://g1.data.e-is.pro' + await downloadAllCplusProfilesRaw('https://g1data.dns1.us', './input/cplusimport.jsonl') + + console.log(Date.now(), 'start wraping in index requests') + + // 12 minutes + // speed is reduced to limit RAM usage and concurrent writes to IPFS node + await wrapRawProfilesInIndexRequest('./input/cplusimport.jsonl', './input/cplusIR.txt') + + console.log(Date.now(), 'start adding to current root cid') -// 12 minutes -// speed is reduced to limit RAM usage and concurrent writes to IPFS node -// wrapRawProfilesInIndexRequest() + // 3 minutes + // imports by batch and logs successive cids -// 3 minutes -// import by batch and logs successive cids -// importIrToAMT(EMPTY_NODE_CID, './input/cplusIR.txt') -const rootCID = CID.parse("bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze") -importIrToAMT(rootCID, './input/devIr+labels.txt') -// → bafyreih4jspnqnsd4o3sdqv7c765uyylhtlh5majjw6aq6clilkq7tmqey (old with simple nodes) -// → bafyreicklp6mtqzubxxti2uddggmsttjbdcuahm2uqxuri4z6duypliax4 (new with more context and fixed labels) + // import to an empty node + // importIrToAMT(EMPTY_NODE_CID, './input/cplusIR.txt') + + // import dev index requests on an existing root + // const rootCID = CID.parse('bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze') + // importIrToAMT(rootCID, './input/devIr+labels.txt') + + // import all cplus data on an existing root + // root CID we want to import to (example: current root CID of datapod → "tamt") + const rootCID = CID.parse('bafyreih5fz46ezyf25jns6azxxjrjr625f2hvkt24zj4w7wm3dfedfwgv4') + await importIrToAMT(rootCID, './input/cplusIR.txt') + // latest log is the new root cid to start the indexer on + // pnpm start 
/ipfs/<root cid> + + console.log(Date.now(), 'finished. Now start your indexer.') +} -// bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze -// bafyreifhhss6h5j72ewdcr6b75wda4573wtskjfp2pqiae5l73efwvrvjy \ No newline at end of file +main()