cesium-plus-import.ts

    import { CID } from 'multiformats'
    import { wrapCplusInIndexRequest, indexRequestsToAMT } from '../cesium-plus'
    import { createInterface } from 'readline'
    import { appendFile, createReadStream } from 'fs'
    import { timestampToKey } from '../processor'
    import { readFile } from 'fs/promises'
    import { EMPTY_NODE_CID } from '../consts'
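
    // note: the functions below rely on the global fetch API, so Node.js 18+ (or an
    // equivalent fetch polyfill) is assumed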
    
    // fetch the raw Cesium+ profile document for the given public key
    // fetchRawCplus('38QzVPhRLbEiqJtvCmRY6A6SraheNA6fJbomFX75b2qb').then(console.log)
    async function fetchRawCplus(id: string): Promise<string> {
      const ENDPOINT = 'https://g1.data.e-is.pro'
      return fetch(ENDPOINT + '/user/profile/' + id + '/_source').then((b) => b.text())
    }
    
    /// download all Cesium+ profiles and append them to a file, one raw JSON document per line
    async function downloadAllCplusProfilesRaw(filename: string) {
      const SCROLL_TIME = '5m'
      const PAGE_SIZE = 100
      const ENDPOINT = 'https://g1.data.e-is.pro'
      const URL = `${ENDPOINT}/user/profile/_search?scroll=${SCROLL_TIME}&size=${PAGE_SIZE}`
      const NOTIF = 1000
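      // the Elasticsearch scroll API keeps a server-side cursor alive for SCROLL_TIME
      // between requests; each response returns a _scroll_id to send back in order to
      // fetch the next page of PAGE_SIZE hits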
    
      // first batch
      let batch = await fetch(`${URL}&filter_path=_scroll_id,hits.total,hits.hits._id`, {
        method: 'post',
        body: JSON.stringify({
          query: { match_all: {} }
        })
      }).then((b) => b.json())
      let scroll_id = batch._scroll_id
      const total = batch.hits.total
      let scrolled = PAGE_SIZE
    
      console.log(`downloading ${total} cplus profiles...`)
    
      // process batches while the scroll keeps returning hits
      // (checking the batch itself rather than the scrolled count avoids dropping
      // the last partial page)
      while (batch.hits.hits.length > 0) {
        // add raw source to the file
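        // (the fetches are not awaited: profiles may be appended in any order, and
        //  the empty appendFile callback silently ignores write errors)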
        for (const hit of batch.hits.hits) {
          fetchRawCplus(hit._id).then((cplusRaw) => appendFile(filename, cplusRaw + '\n', () => {}))
        }
        // take next batch
        batch = await fetch(ENDPOINT + '/_search/scroll', {
          method: 'post',
          body: JSON.stringify({
            scroll: SCROLL_TIME,
            scroll_id: scroll_id
          })
        }).then((b) => b.json())
        scroll_id = batch._scroll_id
        scrolled += PAGE_SIZE
        if (scrolled % NOTIF == 0) {
          console.log(`${scrolled.toString().padStart(5)}/${total}`)
        }
      }
      console.log(`${total}/${total}, done.`)
    }
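
    // example usage (hypothetical invocation; the path matches the input expected
    // by wrapRawProfilesInIndexRequest below):
    //   downloadAllCplusProfilesRaw('./input/cplusimport.jsonl')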
    
    /// wrap all raw Cesium+ profiles in index requests, put them on the IPFS node, and write the results to a file
    async function wrapRawProfilesInIndexRequest() {
      const LIMIT = 500 // max number of lines to process simultaneously
      const NOTIF = 2000 // log every N lines processed
      const input = './input/cplusimport.jsonl'
      const output = './input/cplusIR.txt'
      const rejected = './input/cplusHS.txt'
      const convertImg = false // also upload base64 image as separate file for later reference
      let queueSize = 0
      let readTotal = 0