Skip to content
Snippets Groups Projects
Commit 87484913 authored by Hugo Trentesaux's avatar Hugo Trentesaux
Browse files

wip complete cplus import script

parent 91e05649
No related branches found
No related tags found
No related merge requests found
......@@ -66,7 +66,7 @@ pnpm dev
# run datapod indexer
pnpm start
# run given script
pnpm exec tsx ./src/script/hello.ts
pnpm exec tsx ./src/scripts/cesium-plus-import.ts
```
More detail in the doc below.
......
......@@ -14,10 +14,10 @@ async function fetchRawCplus(id: string): Promise<string> {
}
/// download all c+ data and add them to a file
async function downloadAllCplusProfilesRaw(filename: string) {
async function downloadAllCplusProfilesRaw(endpoint: string, filename: string) {
const SCROLL_TIME = '5m'
const PAGE_SIZE = 100
const ENDPOINT = 'https://g1.data.e-is.pro'
const ENDPOINT = endpoint
const URL = `${ENDPOINT}/user/profile/_search?scroll=${SCROLL_TIME}&size=${PAGE_SIZE}`
const NOTIF = 1000
......@@ -58,11 +58,9 @@ async function downloadAllCplusProfilesRaw(filename: string) {
}
/// put all raw cplus profiles to ipfs node in index request and write result to a file
async function wrapRawProfilesInIndexRequest() {
async function wrapRawProfilesInIndexRequest(input: string, output: string) {
const LIMIT = 500 // max number of lines to process simultaneously
const NOTIF = 2000 // log every N lines processed
const input = './input/cplusimport.jsonl'
const output = './input/cplusIR.txt'
const rejected = './input/cplusHS.txt'
const convertImg = false // also upload base64 image as separate file for later reference
let queueSize = 0
......@@ -120,21 +118,40 @@ async function importIrToAMT(rootNodeCid: CID, input: string) {
await indexRequestsToAMT(requests, rootNodeCid)
}
// 26 minutes
// this can take a while because ~50000 profiles are downloaded in raw format independently
// downloadAllCplusProfilesRaw('./input/cplusimport.jsonl')
/// entry point: full Cesium+ → datapod import pipeline.
/// Downloads raw C+ profiles, wraps them as IPFS index requests, then
/// merges them into an existing AMT root. Each stage logs a timestamp
/// so the long-running steps can be tracked.
async function main() {
  console.log(Date.now(), 'start downloading cplus profiles')
  // 26 minutes
  // this can take a while because ~50000 profiles are downloaded in raw format independently
  // alternative endpoint: 'https://g1.data.e-is.pro'
  await downloadAllCplusProfilesRaw('https://g1data.dns1.us', './input/cplusimport.jsonl')
  console.log(Date.now(), 'start wraping in index requests')
  // 12 minutes
  // speed is reduced to limit RAM usage and concurrent writes to IPFS node
  await wrapRawProfilesInIndexRequest('./input/cplusimport.jsonl', './input/cplusIR.txt')
  console.log(Date.now(), 'start adding to current root cid')
  // 3 minutes
  // import by batch and logs successive cids
  // import to an empty node
  // importIrToAMT(EMPTY_NODE_CID, './input/cplusIR.txt')
  // import dev index requests on an existing root
  // const rootCID = CID.parse('bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze')
  // importIrToAMT(rootCID, './input/devIr+labels.txt')
  // import all cplus data on an existing root
  // root CID we want to import to (example: current root CID of datapod → "tamt")
  const rootCID = CID.parse('bafyreih5fz46ezyf25jns6azxxjrjr625f2hvkt24zj4w7wm3dfedfwgv4')
  await importIrToAMT(rootCID, './input/cplusIR.txt')
  // latest log is the new root cid to start the indexer on
  // pnpm start /ipfs/<root cid>
  console.log(Date.now(), 'finished. Now start your indexer.')
}
// bafyreieybuh6l6bpz3jn76wqbf7jweb4ptq55n3avbaxe3nhkeiabxzmze
// bafyreifhhss6h5j72ewdcr6b75wda4573wtskjfp2pqiae5l73efwvrvjy
\ No newline at end of file
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment