Last active
November 21, 2023 09:18
-
-
Save paulgrammer/dd63c4e2f7a9933be7d2a85a2ccf2154 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const axios = require("axios"); | |
const cheerio = require("cheerio"); | |
const fs = require("fs"); | |
/**
 * Requests a word list from the Glosbe wordlist endpoint for a query string.
 *
 * Sends a GET with the fixed parameters l1="lg", l2="en", after=30,
 * before=0 and env="en", then extracts the `phrase` property of every entry
 * in the response's `after` array.
 *
 * @param {string} q - Value sent as the `q` query parameter.
 * @returns {Promise<string[]>} The extracted phrases. Resolves to an empty
 *   array (after logging the reason) when the request fails or the response
 *   carries no `after` array; never rejects.
 */
async function getWords(q) {
  console.log("Requesting words");
  try {
    const response = await axios.get("https://iapi.glosbe.com/iapi3/wordlist", {
      params: {
        l1: "lg",
        l2: "en",
        q,
        after: 30,
        before: 0,
        env: "en",
      },
    });
    const entries = response.data && response.data.after;
    if (!Array.isArray(entries)) {
      console.log(`Word list response for "${q}" has no "after" array`);
      return [];
    }
    return entries.map(({ phrase }) => phrase);
  } catch (err) {
    // Best-effort by design: callers treat a failed letter as "no words",
    // but the failure is logged instead of being silently discarded.
    console.log(err.message);
    return [];
  }
}
/**
 * Strips the markup this scraper does not want from one HTML fragment.
 *
 * Replaces ampersands with "and" (both the bare character and the `&amp;`
 * entity that serialized HTML uses for it), removes the keyword `<strong>`
 * wrapper and `<div>` tags, drops `+`, `*` and "• " markers, and trims.
 *
 * @param {string} html - Inner HTML of one example cell.
 * @returns {string} The cleaned text.
 */
function formatFragment(html) {
  return html
    .replace(/&(amp;)?/g, "and")
    .replace(/<strong class="keyword">/gi, "")
    .replace(/<\/strong>/gi, "")
    .replace(/<div>/gi, "")
    .replace(/<\/div>/gi, "")
    .replace(/\+/g, "")
    .replace(/\*/g, "")
    .replace(/• /gi, "")
    .trim();
}

/**
 * Downloads one page of examples for a phrase from Glosbe's `tmem` fragment
 * endpoint and extracts the sentence pairs.
 *
 * @param {string} q - Phrase, inserted verbatim into the URL path; any
 *   escaping is the caller's responsibility.
 * @param {number} page - Value sent as the `page` query parameter.
 * @returns {Promise<{text: string[], translations: string[]}|"no-more"|"errors">}
 *   `"no-more"` when the page contains "No examples found", `"errors"` when
 *   the request or parsing fails (the message is logged), otherwise the
 *   cleaned `.relative` cells in `text` and the `div[lang=lg]` cells in
 *   `translations`, index-aligned. Never rejects.
 */
async function getCorpus(q, page) {
  try {
    const response = await axios.get(`https://glosbe.com/lg/en/${q}/fragment/tmem`, {
      params: {
        page,
        mode: "MUST",
        stem: false,
        includedAuthors: "",
        excludedAuthors: "",
      },
    });

    // The resolved URL is logged for progress only; its absence must not
    // fail the page.
    const rawResponse = response.request && response.request.res;
    if (rawResponse) {
      console.log(rawResponse.responseUrl);
    }

    if (response.data.includes("No examples found")) {
      return "no-more";
    }

    const $ = cheerio.load(response.data);
    const text = [];
    const translations = [];
    $(".tmem__item").each((_, node) => {
      const lg = $(node).find("div[lang=lg]").html();
      const en = $(node).find(".relative").html();
      // .html() yields null when the selector matches nothing. Skip the
      // incomplete item so one odd entry does not discard the whole page
      // and the two arrays stay index-aligned.
      if (typeof lg !== "string" || typeof en !== "string") {
        return;
      }
      text.push(formatFragment(en));
      translations.push(formatFragment(lg));
    });
    return { text, translations };
  } catch (err) {
    console.log(err.message);
    return "errors";
  }
}
/**
 * Collects every page of examples for a phrase by calling `getCorpus` with
 * page numbers 1, 2, 3, ... in sequence.
 *
 * Paging stops when `getCorpus` reports `"no-more"` or `"errors"`, or when a
 * page comes back with no pairs at all (otherwise a page that has neither
 * items nor the "no more" marker would be requested forever).
 *
 * @param {string} phrase - Phrase passed through to `getCorpus` unchanged.
 * @returns {Promise<{text: string[], translations: string[]}>} Everything
 *   gathered before paging stopped; both arrays are empty if the first page
 *   already ended the loop.
 */
async function fetchCorpus(phrase) {
  const text = [];
  const translations = [];

  for (let page = 1; ; page++) {
    const result = await getCorpus(phrase, page);
    if (result === "no-more" || result === "errors") {
      break;
    }
    if (result.text.length === 0 && result.translations.length === 0) {
      break;
    }
    text.push(...result.text);
    translations.push(...result.translations);
  }

  return { text, translations };
}
/**
 * Builds a de-duplicated word list by calling `getWords` once for each
 * letter "a" through "z", one request at a time and in alphabetical order.
 *
 * @returns {Promise<string[]>} Every distinct value returned by `getWords`,
 *   in order of first appearance. Rejects if `getWords` rejects, rather than
 *   leaving the promise pending.
 */
async function fetchWords() {
  const unique = new Set();

  for (const letter of "abcdefghijklmnopqrstuvwxyz") {
    const words = await getWords(letter);
    for (const word of words) {
      unique.add(word);
    }
  }

  return [...unique];
}
/**
 * Reads the ids recorded in ./done.txt, one per line.
 *
 * @returns {string[]} The non-empty lines of ./done.txt, or an empty array
 *   when the file does not exist yet (for example on the very first run).
 */
function getDone() {
  if (!fs.existsSync("./done.txt")) {
    return [];
  }
  return fs
    .readFileSync("./done.txt", "utf8")
    .split(/\r?\n/)
    .filter((line) => line !== "");
}
/**
 * Records an id in ./done.txt by appending it as a new line.
 *
 * The file is created when missing. Entries are separated by "\n" with no
 * trailing newline, the same layout produced by joining the list with "\n",
 * but without re-reading and rewriting the whole file on every call.
 *
 * @param {string} word - Id to record.
 * @returns {void}
 */
function setDone(word) {
  const donePath = "./done.txt";
  // Only emit a separator when there is already something to separate from,
  // so a fresh or empty file does not start with a blank line.
  const hasContent = fs.existsSync(donePath) && fs.statSync(donePath).size > 0;
  fs.appendFileSync(donePath, hasContent ? `\n${word}` : word);
}
/**
 * Runs the scrape for a word list and writes one pair of corpus files per
 * word.
 *
 * Words come from `./<input>.txt` (one per line) when that file exists,
 * otherwise from `fetchWords()`. For each word not already recorded by
 * `getDone()`, the examples are fetched with `fetchCorpus`; when both sides
 * are non-empty they are written to
 * `./<input>/data.corpus.www.glosbe.com.jw2019.<id>.en.txt` and `...lg.txt`,
 * and the id is recorded with `setDone` either way.
 *
 * Words are processed strictly one after another, and the returned promise
 * settles only after the last word has been handled.
 *
 * @param {string} [input="words"] - Base name of the word-list file and of
 *   the output directory.
 * @returns {Promise<void>}
 */
async function main(input = "words") {
  const listPath = `./${input}.txt`;
  const outputDir = `./${input}`;

  const words = fs.existsSync(listPath)
    ? fs.readFileSync(listPath).toString().split("\n")
    : await fetchWords();

  // Load the progress list once and keep it in sync in memory instead of
  // re-reading the file for every word.
  const done = new Set(getDone());

  for (const word of words) {
    // A blank line (or a missing phrase) is skipped; it must not end the run.
    if (!word || word.trim() === "") {
      continue;
    }

    // The id doubles as the progress key and as part of the file names.
    const id = word.replace(/\s/g, "_");
    if (done.has(id)) {
      continue;
    }

    // "?" would start the query string, so it is escaped before the phrase
    // is placed in the URL path.
    const { text, translations } = await fetchCorpus(word.replace(/\?/g, "%3F"));

    if (text.length && translations.length) {
      if (!fs.existsSync(outputDir)) {
        fs.mkdirSync(outputDir);
      }
      fs.writeFileSync(
        `${outputDir}/data.corpus.www.glosbe.com.jw2019.${id}.en.txt`,
        text.join("\n")
      );
      fs.writeFileSync(
        `${outputDir}/data.corpus.www.glosbe.com.jw2019.${id}.lg.txt`,
        translations.join("\n")
      );
    }

    setDone(id);
    done.add(id);
  }
}
// Entry point: run the scrape for the "continents" word list. The promise is
// given a rejection handler so a failure is reported and reflected in the
// exit code instead of being left as a floating promise.
main("continents").catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment