Skip to content

Instantly share code, notes, and snippets.

@JohnAllen
Last active March 26, 2021 20:12
Show Gist options
  • Save JohnAllen/8652c8d5a274fb6be4073c11b485d54d to your computer and use it in GitHub Desktop.
Save JohnAllen/8652c8d5a274fb6be4073c11b485d54d to your computer and use it in GitHub Desktop.
const csv = require('csvtojson')
const fs = require('fs')
// Input CSV and output JSON locations, relative to the working directory.
const csvFilePath = 'sentences.csv';
const csvWritePath = 'sentences.json';

const readStream = fs.createReadStream(csvFilePath);
const writeStream = fs.createWriteStream(csvWritePath);

// Author's note: the JSON written by this pipeline is not in the desired
// shape yet; the csvtojson options below are the place to look.
const converter = csv({
  noheader: true,
  downstreamFormat: 'array',
  headers: ['num', 'lang', 'content'],
});

// CSV file -> csvtojson converter -> JSON file.
readStream.pipe(converter).pipe(writeStream);
import json
import codecs
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from google.cloud import firestore
class FirestorePush:
    """Bulk-load the records in ``sentences.json`` into the Firestore ``records`` collection.

    Records are written in batches of ``BATCH_SIZE`` so that each commit stays
    small; every record is expected to carry an ``id`` key (used as the
    document id) and a ``code`` key (used for progress logging).
    """

    # Number of writes queued before a batch is committed.
    BATCH_SIZE = 500

    def __init__(self):
        """Authenticate, create the Firestore client and load the records file."""
        # Login with your Firestore credentials
        cred = credentials.Certificate('/Users/john/lingo-e9b0f-34bc68d28b7b.json')
        self.db_admin = firebase_admin.initialize_app(cred)
        self.db = firestore.Client()
        # My sample DB
        with codecs.open('sentences.json', 'r', 'utf-8') as f:
            self.records_db = json.load(f)

    def push(self):
        """Write every loaded record to Firestore using batched commits.

        Raises:
            KeyError: if a record lacks the ``id`` or ``code`` key.
        """
        # Get a ref to Firestore database.
        records_collection = self.db.collection('records')
        # This is just for logging purposes.
        total = len(self.records_db)
        batch = self.db.batch()
        # Writes queued in the current batch but not yet committed. Tracking
        # this explicitly (instead of testing ``idx % 500``) guarantees the
        # final batch is committed even when the record count is an exact
        # multiple of the batch size.
        pending = 0
        for idx, record in enumerate(self.records_db, start=1):
            print(str(idx) + str('/') + str(total) + ': ' + str(record['code']))
            record_ref = records_collection.document(record['id'])
            # Include current record in batch
            batch.set(record_ref, record)
            pending += 1
            if pending == self.BATCH_SIZE:
                print('Committing..')
                batch.commit()
                # Start a new batch for the next iteration.
                batch = self.db.batch()
                pending = 0
        # Flush whatever is left over after the last full batch.
        if pending:
            print('Committing..')
            batch.commit()
if __name__ == '__main__':
    # Constructing the pusher authenticates and loads sentences.json;
    # push() then uploads the loaded records.
    FirestorePush().push()
const { readFile } = require('fs').promises
const fs = require("fs");
const { promisify } = require('util')
const parse = promisify(require('csv-parse'))
const { Firestore } = require('@google-cloud/firestore')
const admin = require('firebase-admin');
// Firebase project id passed to the Admin SDK.
const projectId = 'lingo-e9b0f';
// Set before the Firestore client is created. Presumably this routes the SDK
// to a local Firestore emulator on port 8002 instead of production — confirm
// an emulator is actually listening there before running.
process.env.FIRESTORE_EMULATOR_HOST = 'localhost:8002';
admin.initializeApp({ projectId });
// Shared Firestore handle used by importSentences.
const db = admin.firestore();
/**
 * Writes parsed sentence rows to the `sentences` collection in batched commits.
 *
 * Each row is read positionally: index 0 -> `num`, 1 -> `lang`, 2 -> `content`.
 * Batches are committed one after another once they hold 500 writes, and any
 * remainder is committed at the end. Nothing is committed for empty input.
 *
 * @param {Array<Array<string>>} records - Parsed rows to import.
 * @param {number} [limit=10000] - Maximum number of rows to import; the
 *   default keeps the cap the function previously hard-coded.
 * @returns {Promise<Array>} Resolves with the result of every batch commit,
 *   in commit order.
 */
async function importSentences(records, limit = 10000) {
  console.log(`Beginning sentence import`);
  const MAX_BATCH_WRITES = 500;
  // Never read past the end of `records`: a short file used to crash the
  // fixed 10000-iteration loop on the first missing row.
  const rowCount = Math.min(records.length, limit);
  const commitResults = [];
  let batch = db.batch();
  let pendingWrites = 0;

  for (let i = 0; i < rowCount; i++) {
    const record = records[i];
    const sentence = {
      num: record[0],
      lang: record[1],
      content: record[2],
    };
    const docRef = db.collection('sentences').doc();
    console.log(`new sentence: ${sentence.content}`);
    batch.set(docRef, sentence);
    pendingWrites += 1;

    // Counting queued writes (rather than testing `i % 500`) avoids the
    // one-document batch that used to be committed at i === 0.
    if (pendingWrites === MAX_BATCH_WRITES) {
      commitResults.push(await batch.commit());
      console.log(`new batch`);
      batch = db.batch();
      pendingWrites = 0;
    }
  }

  // Flush the final partial batch, if any.
  if (pendingWrites > 0) {
    commitResults.push(await batch.commit());
  }
  return commitResults;
}
/**
 * Reads `eng_sentences.tsv`, parses it as tab-delimited text and passes the
 * parsed rows to importSentences. If the import rejects, the error is logged
 * and the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  console.log(`Beginning to read CSV file`);
  const rawText = await readFile('eng_sentences.tsv', 'utf8');
  console.log(`Done reading CSV file`);

  console.log(`Beginning to parse contents`);
  const parseOptions = { relax: true, delimiter: '\t' };
  const rows = await parse(rawText, parseOptions);
  console.log(`Done parsing contents`);

  console.log(`Beginning to writeToFirestore`);
  try {
    await importSentences(rows);
  } catch (err) {
    console.error(err);
    process.exit(1);
  }

  // Reports the number of parsed rows.
  console.log(`Wrote ${rows.length} records`);
}

// Entry point: read/parse failures reject importCsv and are logged here.
importCsv().catch((err) => console.error(err));
const fs = require('fs')
const path = require('path')
const { promisify } = require('util')
const parse = require('csv-parse/lib/sync')
// const parse = promisify(require('csv-parse'))
const { Firestore } = require('@google-cloud/firestore')
const admin = require('firebase-admin') // required
const csvParser = require("csv-parser");
// Firebase project id passed to the Admin SDK.
const projectId = 'lingo-e9b0f'
// Set before the Firestore client is created. Presumably this routes the SDK
// to a local Firestore emulator on port 8002 instead of production — confirm
// an emulator is actually listening there before running.
process.env.FIRESTORE_EMULATOR_HOST = 'localhost:8002'
admin.initializeApp({ projectId })
// Shared Firestore handle used by writeSentences.
const db = admin.firestore()
/**
 * Lists the entries of the `sentence-pairs` directory (resolved against the
 * current working directory) whose extension is exactly `.tsv`.
 *
 * @returns {string[]} Matching file names, without the directory prefix.
 */
const getTSVFiles = () =>
  fs
    .readdirSync('sentence-pairs')
    .filter((entryName) => path.extname(entryName) === '.tsv');
/**
 * Logs an error with console.error and returns nothing. writeSentences uses
 * it as the final rejection handler of its promise chain.
 *
 * @param {*} error - The value to log.
 */
const handleError = (error) => console.error(error);
/**
 * Runs commit factories strictly one after another, waiting `delayMs` before
 * each one and logging progress after each completes. The pause is presumably
 * a throttle to stay under a write-rate limit — confirm the required value.
 *
 * @param {Array<function(): Promise>} batchFactories - Functions that start a
 *   commit when called; they are invoked in array order.
 * @param {number} [delayMs=1100] - Milliseconds to wait before each commit.
 *   The default is the delay that was previously hard-coded.
 * @returns {Promise<void>} Resolves after the last commit; rejects with the
 *   first commit error, in which case later factories are not called.
 */
const commitMultiple = async (batchFactories, delayMs = 1100) => {
  const total = batchFactories.length;
  for (let index = 0; index < total; index++) {
    // Pause first, so even the first commit is delayed (as before).
    await new Promise((resolve) => {
      setTimeout(resolve, delayMs);
    });
    await batchFactories[index]();
    console.log(`Committed ${index + 1} of ${total}`);
  }
};
/**
 * Streams one TSV file from `./sentence-pairs/`, queues one Firestore document
 * per row into write batches of at most 500 writes, and commits the batches
 * through commitMultiple once the whole file has been read.
 *
 * NOTE(review): rows are assumed to have four tab-separated columns
 * (num, content, num, content) and no header line — confirm against the data.
 * NOTE(review): each row is stored as `{ sentences: [{num, content}, {num, content}] }`;
 * the previous code passed the global `JSON` object to `batch.set`, so the
 * intended document shape is a guess — confirm.
 *
 * @param {string} tsvFile - File name inside the `sentence-pairs` directory.
 * @returns {Promise<void>} Resolves when all batches are committed. Failures
 *   are passed to handleError, so the returned promise does not reject.
 */
const writeSentences = (tsvFile) => {
  const MAX_BATCH_WRITES = 500;
  const filePath = './sentence-pairs/' + tsvFile;
  console.log(filePath);

  const batchFactories = [];
  let currentBatch = null;
  let currentBatchWrites = 0;

  // Returns the batch the next write belongs to, opening a new one (and
  // registering its commit) when there is none yet or the current one is full.
  const nextBatch = () => {
    if (currentBatch === null || currentBatchWrites >= MAX_BATCH_WRITES) {
      const freshBatch = db.batch();
      batchFactories.push(() => freshBatch.commit());
      currentBatch = freshBatch;
      currentBatchWrites = 0;
    }
    return currentBatch;
  };

  // Wrap the stream so the chain really waits for "end"; previously the
  // commits were scheduled before a single row had been read.
  const fileRead = new Promise((resolve, reject) => {
    fs.createReadStream(filePath)
      // Errors on the source stream are not forwarded by pipe().
      .on('error', reject)
      // Tab-separated, no header: each row is keyed by column index.
      .pipe(csvParser({ separator: '\t', headers: false }))
      .on('data', (row) => {
        const cells = Object.values(row);
        if (cells.length < 4) {
          console.warn(`Skipping malformed row in ${filePath}`);
          return;
        }
        const [numOne, contentOne, numTwo, contentTwo] = cells;
        const sentences = [
          { num: numOne, content: contentOne },
          { num: numTwo, content: contentTwo },
        ];
        const ref = db.collection('sentence-pairs').doc();
        nextBatch().set(ref, { sentences });
        currentBatchWrites += 1;
      })
      .on('error', reject)
      .on('end', resolve);
  });

  return fileRead
    .then(() => commitMultiple(batchFactories))
    .catch(handleError);
};
/**
 * Imports every `.tsv` file returned by getTSVFiles, one file at a time.
 *
 * Files are processed sequentially and each writeSentences call is awaited,
 * so the returned promise only resolves once every file has been handled
 * (previously the calls were fired from `forEach` and never awaited).
 *
 * @returns {Promise<void>}
 */
async function importSentencePairs() {
  const files = getTSVFiles();
  for (const file of files) {
    await writeSentences(file);
  }
}
/**
 * Script entry point: runs importSentencePairs and, if it rejects, logs the
 * error and exits the process with status 1.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  try {
    await importSentencePairs();
  } catch (err) {
    console.error(err);
    process.exit(1);
  }
}

// The first CLI argument is passed along, but importCsv declares no parameters.
importCsv(process.argv[2]).catch((err) => console.error(err));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment