// cesium-plus-import.ts — authored by Hugo Trentesaux
import { appendFile, createReadStream } from 'fs'
import { appendFile as appendFileAsync, readFile } from 'fs/promises'
import { createInterface } from 'readline'
import { CID } from 'multiformats'
import { wrapCplusInIndexRequest, indexRequestsToAMT } from '../cesium-plus'
import { EMPTY_NODE_CID } from '../consts'
import { timestampToKey } from '../processor'
// Fetch the raw JSON source of a single Cesium+ profile by its public key.
// Example: fetchRawCplus('38QzVPhRLbEiqJtvCmRY6A6SraheNA6fJbomFX75b2qb').then(console.log)
async function fetchRawCplus(id: string): Promise<string> {
  const ENDPOINT = 'https://g1.data.e-is.pro'
  const response = await fetch(`${ENDPOINT}/user/profile/${id}/_source`)
  // without this check, an HTTP error page would be returned (and later stored) as profile data
  if (!response.ok) {
    throw new Error(`fetchRawCplus: HTTP ${response.status} for profile ${id}`)
  }
  return response.text()
}
/// Download all C+ profiles via the Elasticsearch scroll API and append them
/// (one raw JSON document per line) to `filename`.
async function downloadAllCplusProfilesRaw(filename: string) {
  const SCROLL_TIME = '5m' // how long ES keeps the scroll context alive between pages
  const PAGE_SIZE = 100
  const ENDPOINT = 'https://g1.data.e-is.pro'
  const URL = `${ENDPOINT}/user/profile/_search?scroll=${SCROLL_TIME}&size=${PAGE_SIZE}`
  const NOTIF = 1000 // progress log interval, in number of profiles

  // first batch — opens the scroll context and reports the total hit count
  let batch = await fetch(`${URL}&filter_path=_scroll_id,hits.total,hits.hits._id`, {
    method: 'post',
    body: JSON.stringify({
      query: { match_all: {} }
    })
  }).then((b) => b.json())
  let scroll_id = batch._scroll_id
  // NOTE(review): assumes a numeric `hits.total` (ES < 7 or rest_total_hits_as_int) — confirm endpoint version
  const total = batch.hits.total
  let scrolled = 0
  console.log(`downloading ${total} cplus profiles...`)

  // process every fetched batch, INCLUDING the last one (the original loop
  // advanced `scrolled` past `total` before writing the final page, dropping it)
  while (scrolled < total) {
    // fetch the page's profiles in parallel, then append them in one ordered,
    // awaited write — fire-and-forget appends lost ordering and swallowed errors
    const sources: string[] = await Promise.all(
      batch.hits.hits.map((hit: { _id: string }) => fetchRawCplus(hit._id))
    )
    await appendFileAsync(filename, sources.map((s) => s + '\n').join(''))
    scrolled += batch.hits.hits.length
    if (scrolled % NOTIF == 0) {
      console.log(`${scrolled.toString().padStart(5)}/${total}`)
    }
    if (scrolled >= total) break
    // take next batch
    batch = await fetch(ENDPOINT + '/_search/scroll', {
      method: 'post',
      body: JSON.stringify({
        scroll: SCROLL_TIME,
        scroll_id: scroll_id
      })
    }).then((b) => b.json())
    scroll_id = batch._scroll_id
  }
  console.log(`${total}/${total}, done.`)
}
/// put all raw cplus profiles to ipfs node in index request and write result to a file
async function wrapRawProfilesInIndexRequest() {
const LIMIT = 500 // max number of lines to process simultaneously
const NOTIF = 2000 // log every N lines processed
const input = './input/cplusimport.jsonl'
const output = './input/cplusIR.txt'
const rejected = './input/cplusHS.txt'
const convertImg = false // also upload base64 image as separate file for later reference
let queueSize = 0
let readTotal = 0