Skip to content

Instantly share code, notes, and snippets.

@limitedeternity
Last active February 14, 2020 08:21
Show Gist options
  • Save limitedeternity/57faca46016d39712a4d47d8fdee9267 to your computer and use it in GitHub Desktop.
Save limitedeternity/57faca46016d39712a4d47d8fdee9267 to your computer and use it in GitHub Desktop.
LinguaLeo reader scraper
from json import load
def fn(tr, out):
    """Append one translation per sentence from a JSON dump to ``out``.

    ``tr`` is a readable text file object containing a JSON array; each
    element is a list of candidate translation strings for one sentence.
    For every sentence the first candidate that starts with an uppercase
    character is written; if no candidate qualifies, the first candidate is
    written instead. Every written candidate is prefixed with one space.

    Sentences with an empty candidate list are skipped, and empty candidate
    strings are treated as "not capitalized" instead of raising IndexError.
    """
    tr_data = load(tr)
    for line_list in tr_data:
        if not line_list:
            # No candidates for this sentence: there is nothing to write.
            continue
        # Slicing with [:1] yields "" for an empty string, and "".isupper()
        # is False, so empty candidates can never raise here.
        capitalized = [line for line in line_list if line[:1].isupper()]
        chosen = capitalized[0] if capitalized else line_list[0]
        out.write(" " + chosen)
# Merge the per-page translation dumps into two text files: pages 0-9 go to
# the first output file, pages 10-12 to the second.
#
# Encodings are explicit on purpose. JSON files are expected to be UTF-8 and
# the translations are non-ASCII text, so relying on the platform's default
# locale encoding can mis-decode the input or fail when writing the output.
with open("Письм.txt", "w", encoding="utf-8") as out1:
    for i in range(10):
        with open(f"translation ({i}).json", "r", encoding="utf-8") as tr:
            fn(tr, out1)
with open("Устн.txt", "w", encoding="utf-8") as out2:
    for i in range(10, 13):
        with open(f"translation ({i}).json", "r", encoding="utf-8") as tr:
            fn(tr, out2)
// Running total of non-whitespace characters scraped so far. scrap() adds to
// it after every page, and the run loop keeps going until it reaches 35000.
let globalCharCounter = 0;
// Number of pages scraped so far; scrap() increments it once per call.
let globalPageCounter = 0;
/**
 * Download `data` as a pretty-printed JSON file.
 *
 * A single hidden anchor element is created once, when this helper is
 * defined, and reused for every download. Each call serializes `data` with
 * 4-space indentation, wraps it in a Blob, points the anchor at a temporary
 * object URL, clicks it, and then revokes the URL.
 *
 * @param {*} data - Any JSON-serializable value.
 * @param {string} fileName - File name suggested to the browser.
 */
const saveJSON = (() => {
  const anchor = document.createElement("a");
  document.body.appendChild(anchor);
  anchor.style = "display: none";

  return (data, fileName) => {
    const serialized = JSON.stringify(data, null, 4);
    const jsonBlob = new Blob([serialized], {type: "application/json"});
    const objectUrl = window.URL.createObjectURL(jsonBlob);

    anchor.href = objectUrl;
    anchor.download = fileName;
    anchor.click();

    window.URL.revokeObjectURL(objectUrl);
  };
})();
/**
 * Download `data` as a plain-text file.
 *
 * A single hidden anchor element is created once, when this helper is
 * defined, and reused for every download. Each call wraps `data` in a
 * "text/plain" Blob, points the anchor at a temporary object URL, clicks it,
 * and then revokes the URL.
 *
 * @param {string} data - Text content of the file.
 * @param {string} fileName - File name suggested to the browser.
 */
const saveText = (() => {
  const link = document.createElement("a");
  document.body.appendChild(link);
  link.style = "display: none";

  return (data, fileName) => {
    const textBlob = new Blob([data], {type: "text/plain"});
    const objectUrl = window.URL.createObjectURL(textBlob);

    link.href = objectUrl;
    link.download = fileName;
    link.click();

    window.URL.revokeObjectURL(objectUrl);
  };
})();
/**
 * Scrape the current reader page, download its text and translations, and
 * advance to the next page.
 *
 * Steps:
 *  1. Collect the text of every `.ll-translatable-text__sentence` element.
 *  2. POST each trimmed sentence to the getTranslates endpoint (all requests
 *     run in parallel) and keep the `value` of every returned translation.
 *  3. Update the global page / character counters (whitespace is not counted).
 *  4. Download the page text as "text.txt" and the translations as
 *     "translation.json".
 *  5. Click the paginator's "next" button.
 *
 * @returns {Promise<void>}
 * @throws {Error} If a translation request returns a non-2xx status, or if
 *   the next-page button is not present in the document.
 */
async function scrap() {
  const sentenceNodes = document.querySelectorAll(".ll-translatable-text__sentence");
  const textdata = Array.from(sentenceNodes, (node) => node.innerText);

  const translation = await Promise.all(
    textdata.map(async (sentence) => {
      const response = await fetch("https://api.lingualeo.com/getTranslates", {
        method: "POST",
        headers: {"Content-Type": "application/json"},
        body: JSON.stringify({
          apiVersion: "1.0.1",
          text: sentence.trim(),
          ctx: {config: {isCheckData: true, isLogging: true}},
        }),
      });
      if (!response.ok) {
        throw new Error(`getTranslates request failed with HTTP ${response.status}`);
      }

      const payload = await response.json();
      // A reply without a `translate` array becomes an empty list, so the
      // output keeps exactly one entry per sentence instead of crashing.
      const variants = Array.isArray(payload.translate) ? payload.translate : [];
      return variants.map((variant) => variant.value);
    })
  );

  const pageText = textdata.join("");
  globalPageCounter += 1;
  globalCharCounter += pageText.replace(/\s/g, "").length;

  saveText(pageText, "text.txt");
  saveJSON(translation, "translation.json");

  const nextButton = document.querySelector(".ll-leokit__paginator__page__m-next");
  if (nextButton === null) {
    // Fail loudly: silently continuing would leave the caller re-scraping
    // the same page.
    throw new Error("Next-page button not found; cannot advance to the next page.");
  }
  nextButton.click();
}
/**
 * Entry point: scrape page after page until the global character counter
 * reaches the target.
 *
 * After every page the user is asked (via a blocking alert) to save the
 * downloaded files; the loop then polls until sentence elements are present
 * in the document before scraping again. A one-time notice is logged when
 * the counter first reaches the border threshold. Any error that stops the
 * run is reported on the console instead of being left as an unhandled
 * promise rejection.
 */
(async function run() {
  const BORDER_CHARS = 25000;
  const TARGET_CHARS = 35000;
  const POLL_INTERVAL_MS = 100;
  const SENTENCE_SELECTOR = ".ll-translatable-text__sentence";

  // A missing heading must not abort the whole run.
  const heading = document.querySelector("h2");
  if (heading !== null) {
    heading.remove();
  }

  let borderReached = false;
  while (globalCharCounter < TARGET_CHARS) {
    await scrap();

    // Checked right after scraping so the notice is still logged when the
    // target is reached on the same page; the reported page number is the
    // same as when checking at the top of the next iteration.
    if (globalCharCounter >= BORDER_CHARS && !borderReached) {
      borderReached = true;
      console.log(`Border reached at page ${globalPageCounter}!`);
    }

    window.alert("Save and click \"OK\"");
    while (document.querySelectorAll(SENTENCE_SELECTOR).length === 0) {
      await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
    }
  }
  console.log("Done!");
})().catch((error) => {
  console.error("Scraping stopped:", error);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment