Last active
July 14, 2022 08:37
-
-
Save emwadde/1c26556f079f59511026d1b758dae250 to your computer and use it in GitHub Desktop.
Using the browser console as a web-scraping tool
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Fetch a page and collect the text of every `.article-content` element in it.
 *
 * Declared with `var` on purpose: the snippet is meant to be pasted into the
 * browser console, and `var` lets it be pasted again without a
 * "has already been declared" error.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<{url: string, text: string}>} Resolves with the requested
 *   url and the matched elements' text joined by newlines.
 *   Rejects with an `Error` that also carries `url` (the requested address)
 *   and `error` (the failure message) as own properties.
 */
var fetchText = async (url) => {
  try {
    const response = await fetch(url);
    if (response.status !== 200) {
      // statusText may be an empty string (e.g. HTTP/2 has no reason phrase),
      // so fall back to the numeric code to keep the message informative.
      throw new Error(response.statusText || `HTTP ${response.status}`);
    }

    const html = await response.text();
    const doc = new DOMParser().parseFromString(html, "text/html");
    const paras = doc.querySelectorAll(".article-content");
    console.log(`${url} => ${paras.length} paragraphs`);

    const text = [...paras].map((p) => p.textContent).join("\n");
    return { url, text };
  } catch (err) {
    // Network failures, bad statuses and parse errors all end up here so the
    // caller always receives the same rejection shape.
    const message = err instanceof Error ? err.message : String(err);
    throw Object.assign(new Error(message, { cause: err }), {
      url,
      error: message,
    });
  }
};
/**
 * Offer `text` to the user as a plain-text file download.
 *
 * Builds a Blob, points a temporary `<a download>` element at it and clicks
 * the element programmatically. The temporary element and the blob URL are
 * cleaned up afterwards so repeated calls do not accumulate hidden anchors or
 * keep blobs alive.
 *
 * Declared with `var` so the snippet can be pasted into the console again
 * without a redeclaration error.
 *
 * @param {string} filename - Name suggested for the downloaded file.
 * @param {string} text - Content written into the file.
 * @returns {void}
 */
var downloadTextFile = (filename, text) => {
  const file = new Blob([text], {
    type: "text/plain",
  });
  const objectUrl = URL.createObjectURL(file);

  const element = document.createElement("a");
  element.href = objectUrl;
  element.download = filename;
  document.body.appendChild(element);

  try {
    element.click();
  } finally {
    element.remove();
    // Revoke on the next tick rather than immediately, so the browser has
    // picked up the blob for the download before the URL is invalidated.
    setTimeout(() => URL.revokeObjectURL(objectUrl), 0);
  }
};
// Pages to scrape. NOTE(review): the "404-error" entry looks like a deliberate
// bad address for exercising the failure path — confirm before removing it.
var urls = [
  "https://archive.mv/en/articles/AdK5q",
  "https://archive.mv/en/articles/QMDK1",
  "https://archive.mv/en/articles/404-error",
  "https://archive.mv/en/articles/8OLk9",
];

// Start every request up front; allSettled below waits for all of them and
// reports each outcome, so one failing page does not discard the others.
var fetchPromises = urls.map(fetchText);

Promise.allSettled(fetchPromises)
  .then((results) => {
    const fetchedText = results
      .filter((result) => result.status === "fulfilled")
      .map((result) => result.value.text)
      .join("\n");

    // Join explicitly: an array nested inside another join() is stringified
    // with commas, which would put every failure on one comma-separated line.
    const failed = results
      .filter((result) => result.status === "rejected")
      .map((result) => `FAILED: ${result.reason.url}`)
      .join("\n");

    const allText = [fetchedText, failed].join("\n------------------\n");
    downloadTextFile("all_text.txt", allText);
  })
  .catch((err) => {
    // Without this, an exception while building or saving the file would be
    // an unhandled promise rejection.
    console.error("Could not build or download all_text.txt:", err);
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment