Last active
March 30, 2021 18:49
-
-
Save JohnAllen/f8b0026b75f403c894d966df6f445111 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const stringSimilarity = require("string-similarity"); | |
// The code -> name table lives in its own module; without this require the
// `languages` identifier used below (and inside getLangCodeFromLanguage) is
// undefined and the file throws a ReferenceError as soon as it is loaded.
const { languages } = require('./language-object')
// Flat list of every known language name, used as the candidate set for fuzzy matching.
const arrayOfLangNames = Object.values(languages)
/**
 * Resolve a language name to its three-letter code.
 *
 * An exact match against the `languages` table is tried first. When there is
 * none, the closest known name according to `string-similarity` is used.
 *
 * @param {string} languageName - Language name, e.g. "English".
 * @returns {string|undefined} The code whose name matches exactly or most
 *   closely; `undefined` only if the best candidate is not in the table.
 */
function getLangCodeFromLanguage(languageName) {
  const findCodeByName = (name) =>
    Object.keys(languages).find((key) => languages[key] === name)

  const exactCode = findCodeByName(languageName)
  if (exactCode) {
    return exactCode
  }

  // findBestMatch returns { ratings, bestMatch: { target, rating }, bestMatchIndex }.
  // The winning name is `bestMatch.target`; comparing names against the whole
  // result object can never succeed.
  const { bestMatch } = stringSimilarity.findBestMatch(languageName, arrayOfLangNames)
  return findCodeByName(bestMatch.target)
}
exports.getLangCodeFromLanguage = getLangCodeFromLanguage |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { readFile } = require('fs').promises | |
const { promisify } = require('util') | |
const parse = promisify(require('csv-parse')) | |
const { Firestore } = require('@google-cloud/firestore') | |
const admin = require('firebase-admin'); // required | |
const projectId = 'lingo-e9b0f';
// Route the Admin SDK to a local Firestore emulator. A host already present in
// the environment (exported by the shell or by emulator tooling) is kept;
// 'localhost:8002' is only the fallback when nothing is set.
process.env.FIRESTORE_EMULATOR_HOST = process.env.FIRESTORE_EMULATOR_HOST || 'localhost:8002';
admin.initializeApp({ projectId });
// Shared Firestore handle used by the import functions below.
const db = admin.firestore();
/**
 * Write one document per distinct language code into the `languages` collection.
 *
 * Rows are de-duplicated on their first column; only the first row seen for a
 * code is written. Writes are spread over as many batches as needed so that
 * no single batch grows without bound.
 *
 * @param {Array<Array<string>>} records - Parsed rows; index 0 is the code,
 *   index 1 is the name.
 * @param {number} [maxBatchSize=490] - Maximum writes per batch. The default
 *   mirrors the cap the sentence-pair import uses.
 * @returns {Promise<Array>} Resolves with the result of every batch commit
 *   (always at least one, even when `records` is empty).
 */
function importLanguages(records, maxBatchSize = 490) {
  const batchCommits = [];
  const seenCodes = new Set();
  let batch = db.batch();
  let writesInBatch = 0;

  records.forEach((record) => {
    const language = {
      code: record[0],
      name: record[1]
    };
    if (seenCodes.has(language.code)) {
      return;
    }
    seenCodes.add(language.code);
    console.log(`Found new lang: ${language.code}`);

    // Roll over to a fresh batch before the current one exceeds the cap.
    if (writesInBatch >= maxBatchSize) {
      batchCommits.push(batch.commit());
      batch = db.batch();
      writesInBatch = 0;
    }
    batch.set(db.collection('languages').doc(), language);
    writesInBatch += 1;
  });

  batchCommits.push(batch.commit());
  return Promise.all(batchCommits);
}
/**
 * Read `languages.tsv`, parse it as tab-separated rows and import the rows
 * into Firestore via importLanguages.
 *
 * Any failure (reading, parsing or writing) is logged and terminates the
 * process with exit code 1, so a broken run is never reported as a success.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  try {
    console.log(`Beginning to read CSV file`)
    const fileContents = await readFile('languages.tsv', 'utf8')
    console.log(`Done reading CSV file`)
    console.log(`Beginning to parse contents`)
    const records = await parse(fileContents, { relax: true, delimiter: '\t' })
    console.log(`Done parsing contents`)
    console.log(`Beginning to writeToFirestore`)
    await importLanguages(records)
    // Counts parsed rows; importLanguages skips rows whose code was already seen.
    console.log(`Wrote ${records.length} records`)
  } catch (e) {
    console.error(e)
    process.exit(1)
  }
}
// Safety net: anything that still escapes is logged and marks the run as failed.
importCsv().catch(e => {
  console.error(e)
  process.exitCode = 1
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs') | |
const path = require('path') | |
// const { promisify } = require('util') | |
// const parse = require('csv-parse/lib/sync') | |
// const parse = promisify(require('csv-parse')) | |
// const { Firestore } = require('@google-cloud/firestore') | |
const admin = require('firebase-admin') // required | |
const csvParser = require('csv-parser') | |
const { getLangCodeFromLanguage } = require('./getCodeFromLanguage') | |
// const { languages } = require('./language-object') | |
const projectId = 'lingo-e9b0f'
// Route the Admin SDK to a local Firestore emulator. A host already present in
// the environment (exported by the shell or by emulator tooling) is kept;
// 'localhost:8001' is only the fallback when nothing is set.
// NOTE(review): the languages import script falls back to port 8002 - confirm
// both scripts are meant to target the same emulator instance.
process.env.FIRESTORE_EMULATOR_HOST = process.env.FIRESTORE_EMULATOR_HOST || 'localhost:8001'
admin.initializeApp({ projectId })
// Shared Firestore handle used by the import functions below.
const db = admin.firestore()
/**
 * List the `.tsv` files found directly inside the `sentence-pairs` directory
 * (resolved against the current working directory).
 *
 * @returns {string[]} File names (not paths) whose extension is exactly `.tsv`.
 */
const getTSVFiles = () =>
  fs
    .readdirSync('sentence-pairs')
    .filter((entryName) => path.extname(entryName) === '.tsv')
/**
 * Report an error on stderr without rethrowing it.
 *
 * @param {*} error - Whatever was thrown or rejected.
 * @returns {void}
 */
const handleError = (error) => void console.error(error)
/**
 * Wait for every entry of `batches` to settle successfully, logging before
 * and after the wait.
 *
 * @param {Array<Promise|*>} batches - Values handed to Promise.all.
 * @returns {Promise<void>} Rejects with the first rejection among `batches`.
 */
async function writeToDb(batches) {
  const batchCount = batches.length
  console.log(`beginning write of ${batchCount} batches`);
  const allSettled = Promise.all(batches)
  await allSettled
  console.log("done with file batches");
}
/**
 * Commit the given write batches strictly one after another, pausing before
 * each commit.
 *
 * @param {Array<{commit: function(): Promise}>} batchFactories - Objects
 *   exposing `commit()`; despite the name these are the batches themselves.
 * @param {number} [delayMs=1100] - Pause before every commit, in milliseconds.
 * @returns {Promise<void>} Resolves once the last batch is committed; rejects
 *   with the first commit failure, in which case later batches are not attempted.
 */
const commitMultiple = async (batchFactories, delayMs = 1100) => {
  const total = batchFactories.length;
  for (const [index, batch] of batchFactories.entries()) {
    console.log(`ready to commit batch`);
    // Deliberately sequential: wait, then commit, before touching the next batch.
    await new Promise((resolve) => setTimeout(resolve, delayMs));
    console.log(`about to commit batch`);
    await batch.commit();
    console.log(`Committed ${index + 1} of ${total}`);
  }
};
/**
 * Tell whether every enumerable property of `obj` holds a truthy value.
 *
 * Despite the name, any falsy value (undefined, null, '', 0, false, NaN)
 * makes the check fail, and inherited enumerable properties are inspected too.
 *
 * @param {Object} obj - Object to inspect.
 * @returns {boolean} `false` as soon as one falsy value is found, else `true`.
 */
function checkSentenceValuesAreNotUndefined(obj) {
  let everyFieldPresent = true
  for (const field in obj) {
    if (obj[field]) {
      continue
    }
    everyFieldPresent = false
    break
  }
  return everyFieldPresent
}
/**
 * Derive the source and target language codes from a sentence-pair file path
 * such as "./sentence-pairs/Sentence pairs in English-German - 2021-03-25.tsv".
 *
 * The text after "Sentence pairs in " is split on '-'; the first piece is the
 * source language name and the second (trimmed) piece is the target name.
 * Each name is converted with getLangCodeFromLanguage.
 *
 * NOTE(review): a language name that itself contains a hyphen (the table has
 * entries like "Komi-Zyrian") would be cut at its first '-' - confirm such
 * files never occur.
 *
 * @param {string} filePath - Path containing the "Sentence pairs in " marker.
 * @returns {Array} Two-element array: [source code, target code].
 * @throws {TypeError} When the marker or the second name is missing.
 */
const getLangNamesFromFilePath = (filePath) => {
  const nameParts = filePath.split('Sentence pairs in ')[1].split('-')
  const srcCode = getLangCodeFromLanguage(nameParts[0])
  const tgtCode = getLangCodeFromLanguage(nameParts[1].trim())
  return [srcCode, tgtCode]
}
/**
 * Stream one sentence-pair TSV file into the `sentence-pairs` collection.
 *
 * Every row with all six fields present becomes one document. Documents are
 * grouped into write batches of at most 490 and committed through
 * commitMultiple once the whole file has been read.
 *
 * The returned promise settles only after reading AND committing are done, so
 * callers can simply await it. Stream, parse and commit failures are passed to
 * handleError (logged, not rethrown), so one bad file does not abort a
 * multi-file run.
 *
 * @param {string} tsvFile - File name inside ./sentence-pairs/.
 * @returns {Promise<void>}
 */
const writeSentences = (tsvFile) => {
  const MAX_DOCS_PER_BATCH = 490
  const filePath = './sentence-pairs/' + tsvFile
  const languageCodes = getLangNamesFromFilePath(filePath)
  const batches = [db.batch()]
  let batchDocsCount = 0

  return new Promise((resolve, reject) => {
    fs
      .createReadStream(filePath)
      // pipe() does not forward errors, so the file stream needs its own handler.
      .on('error', reject)
      .pipe(csvParser({ separator: '\t', headers: false }))
      .on('data', (row) => {
        const sentence = {
          srcNum: row[0],
          srcLang: languageCodes[0],
          srcContent: row[1],
          tgtNum: row[2],
          tgtLang: languageCodes[1],
          tgtContent: row[3]
        }
        if (!checkSentenceValuesAreNotUndefined(sentence)) {
          return
        }
        // Open a new batch when the current one is full, then add the row to
        // it, so the row that triggers the rollover is written as well.
        if (batchDocsCount >= MAX_DOCS_PER_BATCH) {
          batches.push(db.batch())
          batchDocsCount = 0
        }
        const ref = db.collection('sentence-pairs').doc()
        batches[batches.length - 1].set(ref, sentence)
        batchDocsCount += 1
      })
      .on('error', reject)
      .on('end', () => {
        console.log('done reading file')
        commitMultiple(batches).then(resolve, reject)
      })
  }).catch(handleError)
}
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay passed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) when the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms)
  })
}
/**
 * Import every TSV file reported by getTSVFiles, one file at a time, pausing
 * five seconds after each file.
 *
 * File names look like 'Sentence pairs in English-German - 2021-03-25.tsv'.
 *
 * @returns {Promise<void>}
 */
async function importSentencePairs() {
  const tsvFiles = getTSVFiles()
  for (let i = 0; i < tsvFiles.length; i += 1) {
    const file = tsvFiles[i]
    console.log(file)
    await writeSentences(file)
    await sleep(5000)
  }
}
/**
 * Script entry point: run the sentence-pair import and, if it rejects, log
 * the error and terminate the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function importCsv() {
  try {
    await importSentencePairs()
  } catch (importError) {
    console.error(importError)
    process.exit(1)
  }
}
importCsv(process.argv[2]).catch(e => console.error(e)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Lookup table from three-letter language code to English language name.
 *
 * Entry order is significant for callers that scan the keys and take the
 * first match, so it is kept exactly as authored.
 * NOTE(review): the codes look like ISO 639-3 identifiers - confirm the source.
 */
const languages = {
  ara: 'Arabic',
  eng: 'English',
  jpn: 'Japanese',
  fra: 'French',
  deu: 'German',
  spa: 'Spanish',
  ita: 'Italian',
  vie: 'Vietnamese',
  rus: 'Russian',
  cmn: 'Mandarin Chinese',
  kor: 'Korean',
  nld: 'Dutch',
  heb: 'Hebrew',
  ind: 'Indonesian',
  por: 'Portuguese',
  fin: 'Finnish',
  bul: 'Bulgarian',
  ukr: 'Ukrainian',
  ces: 'Czech',
  epo: 'Esperanto',
  ell: 'Greek',
  tur: 'Turkish',
  swe: 'Swedish',
  nob: 'Norwegian Bokmål',
  zsm: 'Malay',
  est: 'Estonian',
  kat: 'Georgian',
  pol: 'Polish',
  swh: 'Swahili',
  lat: 'Latin',
  wuu: 'Shanghainese',
  arz: 'Egyptian Arabic',
  bel: 'Belarusian',
  hun: 'Hungarian',
  isl: 'Icelandic',
  sqi: 'Albanian',
  yue: 'Cantonese',
  afr: 'Afrikaans',
  fao: 'Faroese',
  fry: 'Frisian',
  bre: 'Breton',
  ron: 'Romanian',
  uig: 'Uyghur',
  uzb: 'Uzbek',
  nno: 'Norwegian Nynorsk',
  srp: 'Serbian',
  tat: 'Tatar',
  yid: 'Yiddish',
  pes: 'Persian',
  nan: 'Min Nan Chinese',
  eus: 'Basque',
  slk: 'Slovak',
  dan: 'Danish',
  hye: 'Armenian',
  acm: 'Iraqi Arabic',
  san: 'Sanskrit',
  urd: 'Urdu',
  hin: 'Hindi',
  ben: 'Bengali',
  cyc: 'CycL',
  cat: 'Catalan',
  kaz: 'Kazakh',
  lvs: 'Latvian',
  bos: 'Bosnian',
  hrv: 'Croatian',
  orv: 'Old East Slavic',
  cha: 'Chamorro',
  tgl: 'Tagalog',
  que: 'Quechua',
  mon: 'Mongolian',
  lit: 'Lithuanian',
  glg: 'Galician',
  gle: 'Irish',
  ina: 'Interlingua',
  jbo: 'Lojban',
  tok: 'Toki Pona',
  ain: 'Ainu',
  scn: 'Sicilian',
  mal: 'Malayalam',
  tlh: 'Klingon',
  slv: 'Slovenian',
  tha: 'Thai',
  lzh: 'Literary Chinese',
  oss: 'Ossetian',
  roh: 'Romansh',
  vol: 'Volapük',
  gla: 'Scottish Gaelic',
  ido: 'Ido',
  ast: 'Asturian',
  ile: 'Interlingue',
  oci: 'Occitan',
  xal: 'Kalmyk',
  ang: 'Old English',
  dsb: 'Lower Sorbian',
  hsb: 'Upper Sorbian',
  ksh: 'Kölsch',
  cym: 'Welsh',
  ewe: 'Ewe',
  sjn: 'Sindarin',
  tel: 'Telugu',
  tpi: 'Tok Pisin',
  qya: 'Quenya',
  nov: 'Novial',
  mri: 'Maori',
  lld: 'Ladin',
  ber: 'Berber',
  xho: 'Xhosa',
  pnb: 'Punjabi (Western)',
  mlg: 'Malagasy',
  grn: 'Guarani',
  lad: 'Ladino',
  pms: 'Piedmontese',
  avk: 'Kotava',
  mar: 'Marathi',
  tpw: 'Old Tupi',
  tgk: 'Tajik',
  prg: 'Old Prussian',
  npi: 'Nepali',
  mlt: 'Maltese',
  ckt: 'Chukchi',
  cor: 'Cornish',
  aze: 'Azerbaijani',
  khm: 'Khmer',
  lao: 'Lao',
  bod: 'Tibetan',
  hil: 'Hiligaynon',
  arq: 'Algerian Arabic',
  pcd: 'Picard',
  grc: 'Ancient Greek',
  amh: 'Amharic',
  awa: 'Awadhi',
  bho: 'Bhojpuri',
  cbk: 'Chavacano',
  enm: 'Middle English',
  frm: 'Middle French',
  hat: 'Haitian Creole',
  jdt: 'Juhuri (Judeo-Tat)',
  kal: 'Greenlandic',
  mhr: 'Meadow Mari',
  nah: 'Nahuatl',
  pdc: 'Pennsylvania German',
  sin: 'Sinhala',
  tuk: 'Turkmen',
  wln: 'Walloon',
  bak: 'Bashkir',
  hau: 'Hausa',
  ltz: 'Luxembourgish',
  mgm: 'Mambae',
  som: 'Somali',
  zul: 'Zulu',
  haw: 'Hawaiian',
  kir: 'Kyrgyz',
  mkd: 'Macedonian',
  mrj: 'Hill Mari',
  ppl: 'Pipil',
  yor: 'Yoruba',
  kin: 'Kinyarwanda',
  shs: 'Shuswap',
  chv: 'Chuvash',
  lkt: 'Lakota',
  ota: 'Ottoman Turkish',
  sna: 'Shona',
  mnw: 'Mon',
  nog: 'Nogai',
  sah: 'Yakut',
  abk: 'Abkhaz',
  tet: 'Tetun',
  tam: 'Tamil',
  udm: 'Udmurt',
  kum: 'Kumyk',
  crh: 'Crimean Tatar',
  nya: 'Chinyanja',
  liv: 'Livonian',
  nav: 'Navajo',
  chr: 'Cherokee',
  guj: 'Gujarati',
  pan: 'Punjabi (Eastern)',
  kha: 'Khasi',
  jav: 'Javanese',
  zza: 'Zaza',
  egl: 'Emilian',
  tir: 'Tigrinya',
  sme: 'Northern Sami',
  max: 'North Moluccan Malay',
  pam: 'Kapampangan',
  dtp: 'Central Dusun',
  cho: 'Choctaw',
  kzj: 'Coastal Kadazan',
  smo: 'Samoan',
  fij: 'Fijian',
  wol: 'Wolof',
  che: 'Chechen',
  sag: 'Sango',
  hif: 'Fiji Hindi',
  ton: 'Tongan',
  ngt: 'Ngeq',
  kam: 'Kamba',
  vec: 'Venetian',
  mya: 'Burmese',
  gil: 'Gilbertese',
  myv: 'Erzya',
  niu: 'Niuean',
  vro: 'Võro',
  glv: 'Manx',
  lin: 'Lingala',
  lfn: 'Lingua Franca Nova',
  pus: 'Pashto',
  kjh: 'Khakas',
  dng: 'Dungan',
  fur: 'Friulian',
  mah: 'Marshallese',
  pfl: 'Palatine German',
  kan: 'Kannada',
  crs: 'Seychellois Creole',
  gsw: 'Swiss German',
  osx: 'Old Saxon',
  sux: 'Sumerian',
  sco: 'Scots',
  moh: 'Mohawk',
  ceb: 'Cebuano',
  lmo: 'Lombard',
  tso: 'Tsonga',
  bua: 'Buryat',
  aym: 'Aymara',
  ilo: 'Ilocano',
  kaa: 'Karakalpak',
  nlv: 'Orizaba Nahuatl',
  ngu: 'Guerrero Nahuatl',
  ady: 'Adyghe',
  brx: 'Bodo',
  gag: 'Gagauz',
  rom: 'Romani',
  lzz: 'Laz',
  fuc: 'Pulaar',
  umb: 'Umbundu',
  tkl: 'Tokelauan',
  sot: 'Southern Sotho',
  alt: 'Southern Altai',
  war: 'Waray',
  snd: 'Sindhi',
  tsn: 'Setswana',
  srd: 'Sardinian',
  pau: 'Palauan',
  gbm: 'Garhwali',
  oji: 'Ojibwe',
  lug: 'Luganda',
  hak: 'Hakka Chinese',
  bam: 'Bambara',
  arg: 'Aragonese',
  asm: 'Assamese',
  fuv: 'Nigerian Fulfulde',
  hoc: 'Ho',
  sun: 'Sundanese',
  apc: 'North Levantine Arabic',
  tyv: 'Tuvinian',
  krc: 'Karachay-Balkar',
  pap: 'Papiamento',
  non: 'Old Norse',
  ori: 'Odia (Oriya)',
  iba: 'Iban',
  oar: 'Old Aramaic',
  ary: 'Moroccan Arabic',
  cyo: 'Cuyonon',
  ibo: 'Igbo',
  csb: 'Kashubian',
  lou: 'Louisiana Creole',
  urh: 'Urhobo',
  mvv: 'Tagal Murut',
  mdf: 'Moksha',
  pag: 'Pangasinan',
  cos: 'Corsican',
  hnj: 'Hmong Njua (Green)',
  rif: 'Tarifit',
  nch: 'Central Huasteca Nahuatl',
  kek: 'Kekchi',
  ssw: 'Swazi',
  ban: 'Balinese',
  aii: 'Assyrian Neo-Aramaic',
  tvl: 'Tuvaluan',
  kxi: 'Keningau Murut',
  bvy: 'Baybayanon',
  mfe: 'Morisyen',
  mww: 'Hmong Daw (White)',
  bcl: 'Central Bikol',
  nau: 'Nauruan',
  zlm: 'Malay (Vernacular)',
  nst: 'Naga (Tangshang)',
  quc: "K'iche'",
  afb: 'Gulf Arabic',
  min: 'Minangkabau',
  tmw: 'Temuan',
  cjy: 'Jin Chinese',
  mai: 'Maithili',
  mad: 'Madurese',
  bjn: 'Banjar',
  got: 'Gothic',
  hsn: 'Xiang Chinese',
  gan: 'Gan Chinese',
  bar: 'Bavarian',
  tzl: 'Talossan',
  sgs: 'Samogitian',
  ldn: 'Láadan',
  dws: 'Dutton World Speedwords',
  afh: 'Afrihili',
  krl: 'Karelian',
  vep: 'Veps',
  rue: 'Rusyn',
  tah: 'Tahitian',
  tly: 'Talysh',
  mic: "Mi'kmaq",
  ext: 'Extremaduran',
  swg: 'Swabian',
  izh: 'Ingrian',
  sma: 'Southern Sami',
  jam: 'Jamaican Patois',
  mwl: 'Mirandese',
  kpv: 'Komi-Zyrian',
  cmo: 'Central Mnong',
  koi: 'Komi-Permyak',
  ike: 'Inuktitut',
  kab: 'Kabyle',
  run: 'Kirundi',
  aln: 'Gheg Albanian',
  akl: 'Aklanon',
  mnc: 'Manchu',
  kas: 'Kashmiri',
  otk: 'Old Turkish',
  aoz: 'Uab Meto',
  shy: 'Tachawit',
  fkv: 'Kven Finnish',
  rap: 'Rapa Nui',
  gcf: 'Guadeloupean Creole French',
  gos: 'Gronings',
  lij: 'Ligurian',
  tig: 'Tigre',
  thv: 'Tahaggart Tamahaq',
  div: 'Dhivehi',
  hrx: 'Hunsrik',
  cay: 'Cayuga',
  gaa: 'Ga',
  cpi: 'Chinese Pidgin English',
  bzt: 'Brithenig',
  ltg: 'Latgalian',
  emx: 'Erromintxela',
  gom: 'Konkani (Goan)',
  chg: 'Chagatai',
  xmf: 'Mingrelian',
  osp: 'Old Spanish',
  tmr: 'Jewish Babylonian Aramaic',
  ryu: 'Okinawan',
  evn: 'Evenki',
  bis: 'Bislama',
  stq: 'Saterland Frisian',
  fro: 'Old French',
  syc: 'Syriac',
  frr: 'North Frisian',
  nys: 'Nyungar',
  tts: 'Isan',
  toi: 'Tonga (Zambezi)',
  new: 'Newari',
  jpa: 'Jewish Palestinian Aramaic',
  phn: 'Phoenician',
  rel: 'Rendille',
  iii: 'Nuosu',
  drt: 'Drents',
  laa: 'Southern Subanen',
  chn: 'Chinook Jargon',
  bal: 'Baluchi',
  pli: 'Pali',
  hbo: 'Ancient Hebrew',
  ajp: 'South Levantine Arabic',
  hax: 'Southern Haida',
  hdn: 'Northern Haida',
  xqa: 'Karakhanid',
  crk: 'Plains Cree',
  yua: 'Yucatec Maya',
  pal: 'Middle Persian (Pahlavi)',
  mni: 'Meitei',
  ayl: 'Libyan Arabic',
  lut: 'Lushootseed',
  ofs: 'Old Frisian',
  nus: 'Nuer',
  ckb: 'Central Kurdish (Soranî)',
  kmr: 'Northern Kurdish (Kurmancî)',
  sdh: 'Southern Kurdish',
  kiu: 'Northern Zaza (Kirmanjki)',
  diq: 'Southern Zaza (Dimli)',
  zgh: 'Standard Moroccan Tamazight',
  bfz: 'Mahasu Pahari',
  qxq: 'Qashqai',
  klj: 'Khalaj',
  dar: 'Dargwa',
  lbe: 'Lak',
  ava: 'Avar',
  mus: 'Muskogee (Creek)',
  abq: 'Abaza',
  inh: 'Ingush'
}
exports.languages = languages |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment