Last active
March 26, 2021 20:12
-
-
Save JohnAllen/8652c8d5a274fb6be4073c11b485d54d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Streams sentences.csv through csvtojson and writes the converted output to sentences.json.
const csv = require('csvtojson');
const fs = require('fs');

const csvFilePath = 'sentences.csv';
const csvWritePath = 'sentences.json';

const readStream = fs.createReadStream(csvFilePath);
const writeStream = fs.createWriteStream(csvWritePath);

// This writes JSON but incorrectly. You will see. Look at the options for this CSV library
// The input has no header row, so the column names are supplied explicitly.
const converter = csv({
  noheader: true,
  downstreamFormat: 'array',
  headers: ['num', 'lang', 'content'],
});

readStream.pipe(converter).pipe(writeStream);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import codecs | |
import firebase_admin | |
from firebase_admin import credentials | |
from firebase_admin import firestore | |
from google.cloud import firestore | |
class FirestorePush:
    """Loads ``sentences.json`` and bulk-writes its records to the ``records`` collection."""

    # Number of writes collected before a batch is committed (the value this
    # script has always used).
    BATCH_SIZE = 500

    def __init__(self):
        # Login with your Firestore credentials
        cred = credentials.Certificate('/Users/john/lingo-e9b0f-34bc68d28b7b.json')
        self.db_admin = firebase_admin.initialize_app(cred)
        self.db = firestore.Client()
        # My sample DB
        with codecs.open('sentences.json', 'r', 'utf-8') as f:
            self.records_db = json.load(f)

    def push(self):
        """Write every record in ``self.records_db`` using batched commits.

        Each record is stored under the document id ``record['id']`` and its
        ``record['code']`` is printed as progress, so both keys must be present
        (a missing key raises ``KeyError``).  A batch is committed as soon as it
        holds ``BATCH_SIZE`` writes; whatever is left over is committed after the
        loop, so no record is dropped regardless of the total count.
        """
        # Get a ref to Firestore database.
        records_collection = self.db.collection('records')

        # This is just for logging purposes.
        total = len(self.records_db)

        batch = self.db.batch()
        pending = 0  # writes queued in the current batch and not yet committed

        for idx, record in enumerate(self.records_db, start=1):
            print(str(idx) + '/' + str(total) + ': ' + str(record['code']))
            record_ref = records_collection.document(record['id'])

            # Include current record in batch
            batch.set(record_ref, record)
            pending += 1

            # Commit as soon as the batch is full, then start a fresh one.
            if pending == self.BATCH_SIZE:
                print('Committing..')
                batch.commit()
                batch = self.db.batch()
                pending = 0

        # Flush the final, partially filled batch (skipped when nothing is queued).
        if pending:
            print('Committing..')
            batch.commit()
if __name__ == '__main__':
    # Script entry point: constructing the pusher loads the credentials and the
    # JSON data, then push() uploads the records.
    FirestorePush().push()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { readFile } = require('fs').promises | |
const fs = require("fs"); | |
const { promisify } = require('util') | |
const parse = promisify(require('csv-parse')) | |
const { Firestore } = require('@google-cloud/firestore') | |
const admin = require('firebase-admin'); | |
// Firestore connection used by the import functions below.
// FIRESTORE_EMULATOR_HOST is assigned before the app is initialised so the
// Firestore handle is created against localhost:8002.
const projectId = 'lingo-e9b0f';
const emulatorHost = 'localhost:8002';
process.env.FIRESTORE_EMULATOR_HOST = emulatorHost;

admin.initializeApp({ projectId: projectId });
const db = admin.firestore();
/**
 * Writes parsed sentence rows to the `sentences` collection using batched commits.
 *
 * Every row is read positionally as `[num, lang, content]` and stored under an
 * auto-generated document id. A batch is committed as soon as it holds 500
 * writes; the remainder is committed after the loop. Commits run one at a time.
 *
 * @param {Array<Array<string>>} records - Parsed rows, one array per sentence.
 * @param {number} [maxRecords=10000] - Upper bound on how many rows are imported.
 *   The default keeps the cap this function has always applied; shorter inputs
 *   are imported in full instead of running past the end of the array.
 * @returns {Promise<Array>} The result of every batch commit, in commit order
 *   (empty when there was nothing to write).
 */
async function importSentences(records, maxRecords = 10000) {
  const MAX_WRITES_PER_BATCH = 500;

  console.log(`Beginning sentence import`);

  // Never index past the rows that actually exist.
  const total = Math.min(records.length, maxRecords);
  const commitResults = [];
  let batch = db.batch();
  let pendingWrites = 0;

  for (let i = 0; i < total; i++) {
    const record = records[i];
    const sentence = {
      num: record[0],
      lang: record[1],
      content: record[2],
    };
    const docRef = db.collection('sentences').doc();
    console.log(`new sentence: ${sentence.content}`);
    batch.set(docRef, sentence);
    pendingWrites += 1;

    // Commit on a full batch rather than on the row index, so the first batch
    // is not flushed with a single document in it.
    if (pendingWrites === MAX_WRITES_PER_BATCH) {
      commitResults.push(await batch.commit());
      console.log(`new batch`);
      batch = db.batch();
      pendingWrites = 0;
    }
  }

  // Flush whatever is left; skip the round trip when the batch is empty.
  if (pendingWrites > 0) {
    commitResults.push(await batch.commit());
  }
  return commitResults;
}
/**
 * Reads `eng_sentences.tsv`, parses it as tab-delimited text and hands the rows
 * to `importSentences`, logging each stage.
 *
 * A failure inside `importSentences` is logged and terminates the process with
 * exit code 1. Failures while reading or parsing are not caught here and reject
 * the returned promise.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  const sourceFile = 'eng_sentences.tsv';
  const parseOptions = { relax: true, delimiter: '\t' };

  console.log(`Beginning to read CSV file`);
  const fileContents = await readFile(sourceFile, 'utf8');
  console.log(`Done reading CSV file`);

  console.log(`Beginning to parse contents`);
  const records = await parse(fileContents, parseOptions);
  console.log(`Done parsing contents`);

  console.log(`Beginning to writeToFirestore`);
  try {
    await importSentences(records);
  } catch (err) {
    console.error(err);
    process.exit(1);
  }

  console.log(`Wrote ${records.length} records`);
}
// Entry point. importCsv only guards the Firestore write step itself, so a
// failure while reading or parsing the file ends up here: log it and make the
// process finish with a non-zero exit code, as the write-failure path does.
importCsv().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs') | |
const path = require('path') | |
const { promisify } = require('util') | |
const parse = require('csv-parse/lib/sync') | |
// const parse = promisify(require('csv-parse')) | |
const { Firestore } = require('@google-cloud/firestore') | |
const admin = require('firebase-admin') // required | |
const csvParser = require("csv-parser"); | |
// Firestore connection shared by the helpers below.
// FIRESTORE_EMULATOR_HOST is assigned before the app is initialised so the
// Firestore handle is created against localhost:8002.
const projectId = 'lingo-e9b0f';
const emulatorHost = 'localhost:8002';
process.env.FIRESTORE_EMULATOR_HOST = emulatorHost;

admin.initializeApp({ projectId: projectId });
const db = admin.firestore();
/**
 * Lists the entries of the `sentence-pairs` directory (resolved against the
 * current working directory) whose extension is exactly `.tsv`.
 *
 * @returns {string[]} Bare file names, without the directory prefix.
 */
const getTSVFiles = () => {
  const isTsv = (name) => path.extname(name) === '.tsv';
  return fs.readdirSync('sentence-pairs').filter(isTsv);
};
/**
 * Terminal error handler for promise chains: prints the error with
 * `console.error` and returns `undefined` without rethrowing.
 *
 * @param {*} error - Whatever the chain rejected with.
 * @returns {void}
 */
const handleError = (error) => void console.error(error);
const commitMultiple = (batchFactories) => { | |
let result = Promise.resolve(); | |
const TIMEOUT = 1100; | |
batchFactories.forEach((promiseFactory, index) => { | |
result = result | |
.then(() => { | |
return new Promise((resolve) => { | |
setTimeout(resolve, TIMEOUT); | |
}); | |
}) | |
.then(promiseFactory) | |
.then(() => | |
console.log(`Committed ${index + 1} of ${batchFactories.length}`) | |
); | |
}); | |
return result; | |
}; | |
/**
 * Streams one TSV file from `./sentence-pairs/`, queues one Firestore write per
 * row into batches of at most 500 writes, and commits the batches through
 * `commitMultiple` once the whole file has been read.
 *
 * Errors from the file stream, the parser or the commits are passed to
 * `handleError`, so the returned promise does not reject.
 *
 * NOTE(review): assumes every line is `<id>\t<text>\t<id>\t<text>` with no
 * header row — confirm against the actual files. Quote characters inside a
 * sentence may also need the parser's `quote` option adjusted — TODO confirm.
 *
 * @param {string} tsvFile - File name inside the `sentence-pairs` directory.
 * @returns {Promise<void>} Settles after every queued batch has been committed.
 */
const writeSentences = (tsvFile) => {
  const MAX_WRITES_PER_BATCH = 500;
  const filePath = './sentence-pairs/' + tsvFile;
  console.log(filePath);

  const batchFactories = [];
  let batch = null; // created lazily, so an empty file queues no commit
  let writesInBatch = 0;

  // Hands the current batch over to the commit queue and resets the counters.
  const queueCurrentBatch = () => {
    const fullBatch = batch;
    batchFactories.push(() => fullBatch.commit());
    batch = null;
    writesInBatch = 0;
  };

  // Wrap the stream in a promise so committing starts only after 'end'.
  return new Promise((resolve, reject) => {
    fs.createReadStream(filePath)
      .on('error', reject) // pipe() does not forward source errors
      // headers: false keys each row by column index and keeps the first line as data.
      .pipe(csvParser({ separator: '\t', headers: false }))
      .on('error', reject)
      .on('data', (row) => {
        const sentences = [
          { num: row[0], content: row[1] },
          { num: row[2], content: row[3] },
        ];

        if (batch === null) {
          batch = db.batch();
        }
        const ref = db.collection('sentence-pairs').doc();
        // A document has to be an object, so the pair is stored under one field.
        batch.set(ref, { sentences });
        writesInBatch += 1; // one set() per row

        if (writesInBatch === MAX_WRITES_PER_BATCH) {
          queueCurrentBatch();
        }
      })
      .on('end', () => {
        if (writesInBatch > 0) {
          queueCurrentBatch();
        }
        resolve();
      });
  })
    .then(() => commitMultiple(batchFactories))
    .catch(handleError);
};
/**
 * Imports every TSV file reported by `getTSVFiles`, one file at a time.
 *
 * Each `writeSentences` call is awaited before the next one starts, so the
 * returned promise settles only after all files have been processed and the
 * per-file commit pacing is not multiplied by running files concurrently.
 *
 * @returns {Promise<void>}
 */
async function importSentencePairs() {
  const files = getTSVFiles();
  for (const file of files) {
    await writeSentences(file);
  }
}
/**
 * Top-level driver: runs `importSentencePairs` and, if it rejects, logs the
 * error and terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  const abort = (err) => {
    console.error(err);
    process.exit(1);
  };

  try {
    await importSentencePairs();
  } catch (err) {
    abort(err);
  }
}
// Entry point. The first CLI argument is forwarded to importCsv; any rejection
// that reaches this point is printed with console.error.
importCsv(process.argv[2]).catch((error) => {
  console.error(error);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment