Skip to content

Instantly share code, notes, and snippets.

@JohnAllen
Last active March 30, 2021 18:49
Show Gist options
  • Save JohnAllen/f8b0026b75f403c894d966df6f445111 to your computer and use it in GitHub Desktop.
Save JohnAllen/f8b0026b75f403c894d966df6f445111 to your computer and use it in GitHub Desktop.
// Fuzzy string matching, used as a fallback when a language name has no exact match.
const stringSimilarity = require("string-similarity");
// Display names of every known language, computed once when the module loads.
// NOTE(review): `languages` is neither defined nor required in the visible part of
// this module — presumably it comes from the language table module; confirm.
const arrayOfLangNames = Object.values(languages)
/**
 * Look up the language code whose display name matches `languageName`.
 *
 * An exact match against the `languages` table is tried first. If there is
 * none, the closest display name according to `string-similarity` is used
 * instead.
 *
 * @param {string} languageName - Display name to resolve, e.g. "German".
 * @returns {string|undefined} The matching key of `languages`, or `undefined`
 *   when no entry matches.
 */
function getLangCodeFromLanguage(languageName) {
  const codeForName = (name) =>
    Object.keys(languages).find((key) => languages[key] === name);

  const exactCode = codeForName(languageName);
  if (exactCode) {
    return exactCode;
  }

  // findBestMatch returns { ratings, bestMatch: { target, rating }, bestMatchIndex }.
  // The matched name is `bestMatch.target`; comparing against the result object
  // itself can never equal a string, so the fallback would always miss.
  const { bestMatch } = stringSimilarity.findBestMatch(languageName, arrayOfLangNames);
  return codeForName(bestMatch.target);
}
exports.getLangCodeFromLanguage = getLangCodeFromLanguage
const { readFile } = require('fs').promises
const { promisify } = require('util')
const parse = promisify(require('csv-parse'))
const { Firestore } = require('@google-cloud/firestore')
const admin = require('firebase-admin'); // required
// Project ID handed to the Firebase Admin SDK.
const projectId = 'lingo-e9b0f';
// Setting FIRESTORE_EMULATOR_HOST makes the Firestore client talk to a local
// emulator at this address instead of the live service.
// NOTE(review): the sentence-pair import script sets port 8001 — confirm which port is intended.
process.env.FIRESTORE_EMULATOR_HOST = 'localhost:8002';
admin.initializeApp({ projectId });
// Firestore handle used by importLanguages.
const db = admin.firestore();
/**
 * Write each distinct language in `records` to the `languages` collection.
 *
 * Records are `[code, name]` rows. Only the first row seen for a given code is
 * written; later rows with the same code are ignored. Writes are spread over
 * as many batches as needed so that no single batch grows without bound.
 *
 * @param {Array<Array<string>>} records - Parsed rows; index 0 is the language
 *   code and index 1 is the language name.
 * @returns {Promise<Array>} Resolves with the commit result of every batch
 *   once all batches have been committed.
 */
function importLanguages(records) {
  // Firestore has historically capped a batched write at 500 operations, so
  // start a fresh batch whenever the current one reaches that size.
  const MAX_WRITES_PER_BATCH = 500;
  const batchCommits = [];
  // A Set gives constant-time duplicate checks instead of rescanning an array.
  const seenCodes = new Set();
  let batch = db.batch();
  let writesInBatch = 0;

  records.forEach((record, i) => {
    console.log(i);
    const language = {
      code: record[0],
      name: record[1],
    };
    if (seenCodes.has(language.code)) {
      return;
    }
    console.log(`Found new lang: ${language.code}`);
    seenCodes.add(language.code);

    if (writesInBatch === MAX_WRITES_PER_BATCH) {
      batchCommits.push(batch.commit());
      batch = db.batch();
      writesInBatch = 0;
    }
    const docRef = db.collection('languages').doc();
    batch.set(docRef, language);
    writesInBatch += 1;
  });

  // Commit whatever is left in the final (possibly partial) batch.
  batchCommits.push(batch.commit());
  return Promise.all(batchCommits);
}
/**
 * Read `languages.tsv`, parse it as tab-separated rows and hand the rows to
 * `importLanguages`, logging each stage to the console.
 *
 * A failure inside `importLanguages` is logged and ends the process with exit
 * code 1. Failures while reading or parsing are not caught here and reject
 * the returned promise.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  const sourcePath = 'languages.tsv';
  const parseOptions = { relax: true, delimiter: '\t' };

  console.log('Beginning to read CSV file');
  const rawText = await readFile(sourcePath, 'utf8');
  console.log('Done reading CSV file');

  console.log('Beginning to parse contents');
  const rows = await parse(rawText, parseOptions);
  console.log('Done parsing contents');

  console.log('Beginning to writeToFirestore');
  try {
    await importLanguages(rows);
  } catch (importError) {
    console.error(importError);
    process.exit(1);
  }
  console.log(`Wrote ${rows.length} records`);
}
// Start the import when the script is loaded; a rejection that escapes importCsv is only logged.
importCsv().catch(e => console.error(e))
const fs = require('fs')
const path = require('path')
// const { promisify } = require('util')
// const parse = require('csv-parse/lib/sync')
// const parse = promisify(require('csv-parse'))
// const { Firestore } = require('@google-cloud/firestore')
const admin = require('firebase-admin') // required
const csvParser = require('csv-parser')
const { getLangCodeFromLanguage } = require('./getCodeFromLanguage')
// const { languages } = require('./language-object')
// Project ID handed to the Firebase Admin SDK.
const projectId = 'lingo-e9b0f'
// Setting FIRESTORE_EMULATOR_HOST makes the Firestore client talk to a local
// emulator at this address instead of the live service.
// NOTE(review): the languages import script sets port 8002 — confirm which port is intended.
process.env.FIRESTORE_EMULATOR_HOST = 'localhost:8001'
admin.initializeApp({ projectId })
// Firestore handle used by writeSentences.
const db = admin.firestore()
/**
 * List the entries of the `sentence-pairs` directory (resolved against the
 * current working directory) whose extension is exactly `.tsv`.
 *
 * @returns {string[]} Matching entry names, without the directory prefix.
 */
const getTSVFiles = () => {
  const hasTsvExtension = (entryName) => path.extname(entryName) === '.tsv';
  return fs.readdirSync('sentence-pairs').filter(hasTsvExtension);
};
/**
 * Report an error by printing it with `console.error`.
 *
 * @param {*} error - The value to report.
 * @returns {void}
 */
const handleError = (error) => {
  console.error(error);
};
/**
 * Wait for every entry of `batches` to settle successfully, logging before
 * and after the wait.
 *
 * @param {Array<Promise|*>} batches - Values passed straight to `Promise.all`.
 * @returns {Promise<void>} Rejects if any entry rejects.
 */
async function writeToDb(batches) {
  const batchCount = batches.length;
  console.log(`beginning write of ${batchCount} batches`);
  await Promise.all(batches);
  console.log('done with file batches');
}
/**
 * Commit the given batches strictly one after another, waiting 1100 ms before
 * each commit and logging progress.
 *
 * @param {Array<{commit: Function}>} batchFactories - Objects exposing a
 *   `commit()` method; they are committed in array order.
 * @returns {Promise<void>} Resolves after the last commit; rejects as soon as
 *   one commit rejects, in which case later batches are not committed.
 */
const commitMultiple = (batchFactories) => {
  const PAUSE_MS = 1100;

  // Resolves after the pause; the log line is emitted when the wait starts.
  const pauseBeforeCommit = () =>
    new Promise((wake) => {
      console.log('ready to commit batch');
      setTimeout(wake, PAUSE_MS);
    });

  // Fold the batches into one promise chain so each commit starts only after
  // the previous one has finished.
  return batchFactories.reduce(
    (chain, batch, index) =>
      chain
        .then(pauseBeforeCommit)
        .then(() => {
          console.log('about to commit batch');
          return batch.commit();
        })
        .then(() => {
          console.log(`Committed ${index + 1} of ${batchFactories.length}`);
        }),
    Promise.resolve()
  );
};
/**
 * Tell whether every enumerable property of `obj` holds a truthy value.
 *
 * Despite the name, any falsy value (undefined, null, '', 0, false, NaN)
 * makes the check fail, not just `undefined`.
 *
 * @param {Object} obj - Object whose property values are inspected.
 * @returns {boolean} `false` as soon as a falsy value is found, otherwise `true`.
 */
function checkSentenceValuesAreNotUndefined(obj) {
  let everyValuePresent = true;
  for (const field in obj) {
    if (!obj[field]) {
      everyValuePresent = false;
      break;
    }
  }
  return everyValuePresent;
}
/**
 * Derive the source and target language codes from a sentence-pair file path
 * such as './sentence-pairs/Sentence pairs in English-German - 2021-03-25.tsv'.
 *
 * The text after 'Sentence pairs in ' is split on '-': the first piece is the
 * source language name (used as-is) and the second piece, trimmed, is the
 * target language name. Each name is resolved with `getLangCodeFromLanguage`.
 *
 * @param {string} filePath - Path containing the 'Sentence pairs in ' marker.
 * @returns {Array} Two-element array `[sourceCode, targetCode]`.
 * @throws {TypeError} If the marker or the '-' separator is missing.
 */
const getLangNamesFromFilePath = (filePath) => {
  const afterMarker = filePath.split('Sentence pairs in ')[1];
  const [sourceName, rawTargetName] = afterMarker.split('-');

  // Resolve the source first so the lookups happen in source-then-target order.
  const sourceCode = getLangCodeFromLanguage(sourceName);
  const targetCode = getLangCodeFromLanguage(rawTargetName.trim());
  return [sourceCode, targetCode];
};
/**
 * Import one sentence-pair TSV file from `./sentence-pairs/` into the
 * `sentence-pairs` collection.
 *
 * Each row is read as `[srcNum, srcContent, tgtNum, tgtContent]`; the source
 * and target language codes come from the file name. Rows for which
 * `checkSentenceValuesAreNotUndefined` returns false are skipped. Valid rows
 * are grouped into batches of at most 490 writes, which are committed with
 * `commitMultiple` once the whole file has been read.
 *
 * @param {string} tsvFile - File name inside `./sentence-pairs/`.
 * @returns {Promise<void>} Resolves after every batch has been committed.
 *   Stream and commit errors are passed to `handleError` rather than
 *   rejecting the returned promise.
 * @throws {TypeError} Synchronously, if the file name cannot be parsed by
 *   `getLangNamesFromFilePath`.
 */
const writeSentences = (tsvFile) => {
  const MAX_DOCS_PER_BATCH = 490;
  const filePath = `./sentence-pairs/${tsvFile}`;
  // Resolved before any stream is opened, so a malformed name fails fast.
  const languageCodes = getLangNamesFromFilePath(filePath);
  const batches = [db.batch()];
  let batchDocsCount = 0;

  // Queue one parsed row for writing, opening a new batch when the current one is full.
  const queueRow = (row) => {
    const sentence = {
      srcNum: row[0],
      srcLang: languageCodes[0],
      srcContent: row[1],
      tgtNum: row[2],
      tgtLang: languageCodes[1],
      tgtContent: row[3],
    };
    if (!checkSentenceValuesAreNotUndefined(sentence)) {
      return;
    }
    if (batchDocsCount === MAX_DOCS_PER_BATCH) {
      batches.push(db.batch());
      batchDocsCount = 0;
    }
    // The row is added AFTER rolling over, so the sentence that triggers a
    // new batch is written too instead of being dropped.
    const ref = db.collection('sentence-pairs').doc();
    batches[batches.length - 1].set(ref, sentence);
    batchDocsCount += 1;
  };

  return new Promise((resolve, reject) => {
    const source = fs.createReadStream(filePath);
    // pipe() does not forward errors, so listen on the file stream as well as the parser.
    source.on('error', reject);
    source
      .pipe(csvParser({ separator: '\t', headers: false }))
      .on('data', queueRow)
      .on('error', reject)
      .on('end', () => {
        console.log('done reading file');
        // Settle only once the commits finish, so callers can await the import.
        commitMultiple(batches).then(resolve, reject);
      });
  }).catch(handleError);
};
/**
 * Create a promise that resolves after a `setTimeout` delay.
 *
 * @param {number} ms - Delay passed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Import every file returned by `getTSVFiles`, one at a time.
 *
 * For each file the name is logged, `writeSentences` is awaited, and then the
 * loop sleeps for five seconds before moving on to the next file.
 * File names look like 'Sentence pairs in English-German - 2021-03-25.tsv'.
 *
 * @returns {Promise<void>}
 */
async function importSentencePairs() {
  const PAUSE_BETWEEN_FILES_MS = 5000;
  const tsvFiles = getTSVFiles();
  for (const tsvFile of tsvFiles) {
    console.log(tsvFile);
    await writeSentences(tsvFile);
    await sleep(PAUSE_BETWEEN_FILES_MS);
  }
}
/**
 * Run `importSentencePairs`; if it fails, log the error and end the process
 * with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  try {
    await importSentencePairs();
  } catch (failure) {
    console.error(failure);
    process.exit(1);
  }
}
// Start the import when the script is loaded; a rejection that escapes importCsv is only logged.
// NOTE(review): importCsv declares no parameters, so the process.argv[2] argument is ignored.
importCsv(process.argv[2]).catch(e => console.error(e))
/**
 * Lookup table from three-letter language code to English display name.
 * Keys are the codes (e.g. 'eng'); values are the names (e.g. 'English').
 */
const languages = {
'ara': 'Arabic',
'eng': 'English',
'jpn': 'Japanese',
'fra': 'French',
'deu': 'German',
'spa': 'Spanish',
'ita': 'Italian',
'vie': 'Vietnamese',
'rus': 'Russian',
'cmn': 'Mandarin Chinese',
'kor': 'Korean',
'nld': 'Dutch',
'heb': 'Hebrew',
'ind': 'Indonesian',
'por': 'Portuguese',
'fin': 'Finnish',
'bul': 'Bulgarian',
'ukr': 'Ukrainian',
'ces': 'Czech',
'epo': 'Esperanto',
'ell': 'Greek',
'tur': 'Turkish',
'swe': 'Swedish',
'nob': 'Norwegian Bokmål',
'zsm': 'Malay',
'est': 'Estonian',
'kat': 'Georgian',
'pol': 'Polish',
'swh': 'Swahili',
'lat': 'Latin',
'wuu': 'Shanghainese',
'arz': 'Egyptian Arabic',
'bel': 'Belarusian',
'hun': 'Hungarian',
'isl': 'Icelandic',
'sqi': 'Albanian',
'yue': 'Cantonese',
'afr': 'Afrikaans',
'fao': 'Faroese',
'fry': 'Frisian',
'bre': 'Breton',
'ron': 'Romanian',
'uig': 'Uyghur',
'uzb': 'Uzbek',
'nno': 'Norwegian Nynorsk',
'srp': 'Serbian',
'tat': 'Tatar',
'yid': 'Yiddish',
'pes': 'Persian',
'nan': 'Min Nan Chinese',
'eus': 'Basque',
'slk': 'Slovak',
'dan': 'Danish',
'hye': 'Armenian',
'acm': 'Iraqi Arabic',
'san': 'Sanskrit',
'urd': 'Urdu',
'hin': 'Hindi',
'ben': 'Bengali',
'cyc': 'CycL',
'cat': 'Catalan',
'kaz': 'Kazakh',
'lvs': 'Latvian',
'bos': 'Bosnian',
'hrv': 'Croatian',
'orv': 'Old East Slavic',
'cha': 'Chamorro',
'tgl': 'Tagalog',
'que': 'Quechua',
'mon': 'Mongolian',
'lit': 'Lithuanian',
'glg': 'Galician',
'gle': 'Irish',
'ina': 'Interlingua',
'jbo': 'Lojban',
'tok': 'Toki Pona',
'ain': 'Ainu',
'scn': 'Sicilian',
'mal': 'Malayalam',
'tlh': 'Klingon',
'slv': 'Slovenian',
'tha': 'Thai',
'lzh': 'Literary Chinese',
'oss': 'Ossetian',
'roh': 'Romansh',
'vol': 'Volapük',
'gla': 'Scottish Gaelic',
'ido': 'Ido',
'ast': 'Asturian',
'ile': 'Interlingue',
'oci': 'Occitan',
'xal': 'Kalmyk',
'ang': 'Old English',
'dsb': 'Lower Sorbian',
'hsb': 'Upper Sorbian',
'ksh': 'Kölsch',
'cym': 'Welsh',
'ewe': 'Ewe',
'sjn': 'Sindarin',
'tel': 'Telugu',
'tpi': 'Tok Pisin',
'qya': 'Quenya',
'nov': 'Novial',
'mri': 'Maori',
'lld': 'Ladin',
'ber': 'Berber',
'xho': 'Xhosa',
'pnb': 'Punjabi (Western)',
'mlg': 'Malagasy',
'grn': 'Guarani',
'lad': 'Ladino',
'pms': 'Piedmontese',
'avk': 'Kotava',
'mar': 'Marathi',
'tpw': 'Old Tupi',
'tgk': 'Tajik',
'prg': 'Old Prussian',
'npi': 'Nepali',
'mlt': 'Maltese',
'ckt': 'Chukchi',
'cor': 'Cornish',
'aze': 'Azerbaijani',
'khm': 'Khmer',
'lao': 'Lao',
'bod': 'Tibetan',
'hil': 'Hiligaynon',
'arq': 'Algerian Arabic',
'pcd': 'Picard',
'grc': 'Ancient Greek',
'amh': 'Amharic',
'awa': 'Awadhi',
'bho': 'Bhojpuri',
'cbk': 'Chavacano',
'enm': 'Middle English',
'frm': 'Middle French',
'hat': 'Haitian Creole',
'jdt': 'Juhuri (Judeo-Tat)',
'kal': 'Greenlandic',
'mhr': 'Meadow Mari',
'nah': 'Nahuatl',
'pdc': 'Pennsylvania German',
'sin': 'Sinhala',
'tuk': 'Turkmen',
'wln': 'Walloon',
'bak': 'Bashkir',
'hau': 'Hausa',
'ltz': 'Luxembourgish',
'mgm': 'Mambae',
'som': 'Somali',
'zul': 'Zulu',
'haw': 'Hawaiian',
'kir': 'Kyrgyz',
'mkd': 'Macedonian',
'mrj': 'Hill Mari',
'ppl': 'Pipil',
'yor': 'Yoruba',
'kin': 'Kinyarwanda',
'shs': 'Shuswap',
'chv': 'Chuvash',
'lkt': 'Lakota',
'ota': 'Ottoman Turkish',
'sna': 'Shona',
'mnw': 'Mon',
'nog': 'Nogai',
'sah': 'Yakut',
'abk': 'Abkhaz',
'tet': 'Tetun',
'tam': 'Tamil',
'udm': 'Udmurt',
'kum': 'Kumyk',
'crh': 'Crimean Tatar',
'nya': 'Chinyanja',
'liv': 'Livonian',
'nav': 'Navajo',
'chr': 'Cherokee',
'guj': 'Gujarati',
'pan': 'Punjabi (Eastern)',
'kha': 'Khasi',
'jav': 'Javanese',
'zza': 'Zaza',
'egl': 'Emilian',
'tir': 'Tigrinya',
'sme': 'Northern Sami',
'max': 'North Moluccan Malay',
'pam': 'Kapampangan',
'dtp': 'Central Dusun',
'cho': 'Choctaw',
'kzj': 'Coastal Kadazan',
'smo': 'Samoan',
'fij': 'Fijian',
'wol': 'Wolof',
'che': 'Chechen',
'sag': 'Sango',
'hif': 'Fiji Hindi',
'ton': 'Tongan',
'ngt': 'Ngeq',
'kam': 'Kamba',
'vec': 'Venetian',
'mya': 'Burmese',
'gil': 'Gilbertese',
'myv': 'Erzya',
'niu': 'Niuean',
'vro': 'Võro',
'glv': 'Manx',
'lin': 'Lingala',
'lfn': 'Lingua Franca Nova',
'pus': 'Pashto',
'kjh': 'Khakas',
'dng': 'Dungan',
'fur': 'Friulian',
'mah': 'Marshallese',
'pfl': 'Palatine German',
'kan': 'Kannada',
'crs': 'Seychellois Creole',
'gsw': 'Swiss German',
'osx': 'Old Saxon',
'sux': 'Sumerian',
'sco': 'Scots',
'moh': 'Mohawk',
'ceb': 'Cebuano',
'lmo': 'Lombard',
'tso': 'Tsonga',
'bua': 'Buryat',
'aym': 'Aymara',
'ilo': 'Ilocano',
'kaa': 'Karakalpak',
'nlv': 'Orizaba Nahuatl',
'ngu': 'Guerrero Nahuatl',
'ady': 'Adyghe',
'brx': 'Bodo',
'gag': 'Gagauz',
'rom': 'Romani',
'lzz': 'Laz',
'fuc': 'Pulaar',
'umb': 'Umbundu',
'tkl': 'Tokelauan',
'sot': 'Southern Sotho',
'alt': 'Southern Altai',
'war': 'Waray',
'snd': 'Sindhi',
'tsn': 'Setswana',
'srd': 'Sardinian',
'pau': 'Palauan',
'gbm': 'Garhwali',
'oji': 'Ojibwe',
'lug': 'Luganda',
'hak': 'Hakka Chinese',
'bam': 'Bambara',
'arg': 'Aragonese',
'asm': 'Assamese',
'fuv': 'Nigerian Fulfulde',
'hoc': 'Ho',
'sun': 'Sundanese',
'apc': 'North Levantine Arabic',
'tyv': 'Tuvinian',
'krc': 'Karachay-Balkar',
'pap': 'Papiamento',
'non': 'Old Norse',
'ori': 'Odia (Oriya)',
'iba': 'Iban',
'oar': 'Old Aramaic',
'ary': 'Moroccan Arabic',
'cyo': 'Cuyonon',
'ibo': 'Igbo',
'csb': 'Kashubian',
'lou': 'Louisiana Creole',
'urh': 'Urhobo',
'mvv': 'Tagal Murut',
'mdf': 'Moksha',
'pag': 'Pangasinan',
'cos': 'Corsican',
'hnj': 'Hmong Njua (Green)',
'rif': 'Tarifit',
'nch': 'Central Huasteca Nahuatl',
'kek': 'Kekchi',
'ssw': 'Swazi',
'ban': 'Balinese',
'aii': 'Assyrian Neo-Aramaic',
'tvl': 'Tuvaluan',
'kxi': 'Keningau Murut',
'bvy': 'Baybayanon',
'mfe': 'Morisyen',
'mww': 'Hmong Daw (White)',
'bcl': 'Central Bikol',
'nau': 'Nauruan',
'zlm': 'Malay (Vernacular)',
'nst': 'Naga (Tangshang)',
'quc': 'K\'iche\'',
'afb': 'Gulf Arabic',
'min': 'Minangkabau',
'tmw': 'Temuan',
'cjy': 'Jin Chinese',
'mai': 'Maithili',
'mad': 'Madurese',
'bjn': 'Banjar',
'got': 'Gothic',
'hsn': 'Xiang Chinese',
'gan': 'Gan Chinese',
'bar': 'Bavarian',
'tzl': 'Talossan',
'sgs': 'Samogitian',
'ldn': 'Láadan',
'dws': 'Dutton World Speedwords',
'afh': 'Afrihili',
'krl': 'Karelian',
'vep': 'Veps',
'rue': 'Rusyn',
'tah': 'Tahitian',
'tly': 'Talysh',
'mic': 'Mi\'kmaq',
'ext': 'Extremaduran',
'swg': 'Swabian',
'izh': 'Ingrian',
'sma': 'Southern Sami',
'jam': 'Jamaican Patois',
'mwl': 'Mirandese',
'kpv': 'Komi-Zyrian',
'cmo': 'Central Mnong',
'koi': 'Komi-Permyak',
'ike': 'Inuktitut',
'kab': 'Kabyle',
'run': 'Kirundi',
'aln': 'Gheg Albanian',
'akl': 'Aklanon',
'mnc': 'Manchu',
'kas': 'Kashmiri',
'otk': 'Old Turkish',
'aoz': 'Uab Meto',
'shy': 'Tachawit',
'fkv': 'Kven Finnish',
'rap': 'Rapa Nui',
'gcf': 'Guadeloupean Creole French',
'gos': 'Gronings',
'lij': 'Ligurian',
'tig': 'Tigre',
'thv': 'Tahaggart Tamahaq',
'div': 'Dhivehi',
'hrx': 'Hunsrik',
'cay': 'Cayuga',
'gaa': 'Ga',
'cpi': 'Chinese Pidgin English',
'bzt': 'Brithenig',
'ltg': 'Latgalian',
'emx': 'Erromintxela',
'gom': 'Konkani (Goan)',
'chg': 'Chagatai',
'xmf': 'Mingrelian',
'osp': 'Old Spanish',
'tmr': 'Jewish Babylonian Aramaic',
'ryu': 'Okinawan',
'evn': 'Evenki',
'bis': 'Bislama',
'stq': 'Saterland Frisian',
'fro': 'Old French',
'syc': 'Syriac',
'frr': 'North Frisian',
'nys': 'Nyungar',
'tts': 'Isan',
'toi': 'Tonga (Zambezi)',
'new': 'Newari',
'jpa': 'Jewish Palestinian Aramaic',
'phn': 'Phoenician',
'rel': 'Rendille',
'iii': 'Nuosu',
'drt': 'Drents',
'laa': 'Southern Subanen',
'chn': 'Chinook Jargon',
'bal': 'Baluchi',
'pli': 'Pali',
'hbo': 'Ancient Hebrew',
'ajp': 'South Levantine Arabic',
'hax': 'Southern Haida',
'hdn': 'Northern Haida',
'xqa': 'Karakhanid',
'crk': 'Plains Cree',
'yua': 'Yucatec Maya',
'pal': 'Middle Persian (Pahlavi)',
'mni': 'Meitei',
'ayl': 'Libyan Arabic',
'lut': 'Lushootseed',
'ofs': 'Old Frisian',
'nus': 'Nuer',
'ckb': 'Central Kurdish (Soranî)',
'kmr': 'Northern Kurdish (Kurmancî)',
'sdh': 'Southern Kurdish',
'kiu': 'Northern Zaza (Kirmanjki)',
'diq': 'Southern Zaza (Dimli)',
'zgh': 'Standard Moroccan Tamazight',
'bfz': 'Mahasu Pahari',
'qxq': 'Qashqai',
'klj': 'Khalaj',
'dar': 'Dargwa',
'lbe': 'Lak',
'ava': 'Avar',
'mus': 'Muskogee (Creek)',
'abq': 'Abaza',
'inh': 'Ingush'
}
exports.languages = languages
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment