Created
January 30, 2021 17:51
-
-
Save jargnar/987f726ba879d53b83895698326e830f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// MIT
// Suhas Guruprasad, [email protected]
const puppeteer = require("puppeteer"); | |
const fetch = require("node-fetch"); | |
const imagemin = require("imagemin"); | |
const imageminMozjpeg = require("imagemin-mozjpeg"); | |
const fs = require("fs").promises; | |
/**
 * Collects the distinct article links found on the current page.
 *
 * Reads the global `document`, so it has to run in a browser page context
 * (it is handed to `page.evaluate`). It must stay self-contained: no
 * references to anything defined outside its own body.
 *
 * @returns {string[]} Unique hrefs of `article a` elements that start with
 *   the news section prefix and end in "html", in first-seen order.
 */
function articleHrefsFromHomePage() {
  const prefix = 'https://www.berlin.de/en/news/';
  const suffix = 'html';
  const seen = new Set();
  for (const anchor of document.querySelectorAll('article a')) {
    const url = anchor.href;
    if (url.startsWith(prefix) && url.endsWith(suffix)) {
      seen.add(url);
    }
  }
  return Array.from(seen);
}
/**
 * Reads the headline of the current page.
 *
 * Runs in a browser page context (uses the global `document`).
 *
 * @returns {string} The `innerText` of the first `h1` element.
 * @throws {TypeError} If the page has no `h1` element.
 */
function articleTitle() {
  const heading = document.querySelector('h1');
  return heading.innerText;
}
/**
 * Builds the article text from the paragraph blocks of the current page.
 *
 * Runs in a browser page context (uses the global `document`). For every
 * `div.block.paragraph` element, line breaks are removed and each "+" is
 * turned into a space; the cleaned paragraphs are then joined with one space.
 *
 * @returns {string} The concatenated article text ("" when no paragraph
 *   block is present).
 */
function articleBody() {
  const paragraphs = document.querySelectorAll('div.block.paragraph');
  return Array.from(paragraphs, (el) =>
    el.innerText
      .replace(/\r?\n|\r/g, '')
      // Global pattern: a plain-string pattern would only replace the first "+".
      .replace(/\+/g, ' ')
  ).join(' ');
}
/**
 * Returns the absolute URL of the first image inside `.main-content`.
 *
 * Runs in a browser page context (uses the global `document`). The `src`
 * attribute is resolved against the site origin, so site-relative paths
 * ("/path/img.jpg") give the same result as plain prefixing, while absolute
 * and protocol-relative sources are kept intact instead of being mangled.
 *
 * @returns {string} Absolute image URL.
 * @throws {Error} If no image with a non-empty `src` attribute is found.
 */
function articleImg() {
  const img = document.querySelector('.main-content img');
  const link = img ? img.getAttribute('src') : null;
  if (!link) {
    throw new Error('No image with a src attribute found in .main-content');
  }
  return new URL(link, 'https://www.berlin.de').href;
}
/**
 * Extracts a date string from the `p.source.date` element of the current page.
 *
 * Runs in a browser page context (uses the global `document`). It takes the
 * text between the first and second ":" of the element's `innerText`, removes
 * the first "Uhr", trims surrounding whitespace and drops the last two
 * characters.
 * NOTE(review): presumably the text looks like "<label>: <date> HH:MM Uhr",
 * so the dropped characters are the hour digits; confirm against the live page.
 *
 * @returns {string} The extracted date text.
 * @throws {TypeError} If the element is missing or its text contains no ":".
 */
function articleDate() {
  const rawText = document.querySelector('p.source.date').innerText;
  const [, afterFirstColon] = rawText.split(':');
  const withoutUhr = afterFirstColon.replace('Uhr', '');
  const trimmed = withoutUhr.trim();
  return trimmed.slice(0, -2);
}
/**
 * Downloads an image, compresses it on disk and returns it base64-encoded.
 *
 * The image is written to the current working directory under a name made of
 * the URI's alphanumeric characters plus ".jpg", compressed via
 * `compressImg`, and then read back. The file is left on disk afterwards.
 *
 * @param {string} uri - URL of the image to download.
 * @returns {Promise<string>} Base64 encoding of the compressed file.
 * @throws {Error} If the HTTP response status is not in the 2xx range.
 */
async function compressedBase64ImgFromURI(uri) {
  const response = await fetch(uri);
  // Without this check an HTTP error page would be saved as a ".jpg" and
  // handed to the compressor.
  if (!response.ok) {
    throw new Error(`Failed to download image ${uri}: HTTP ${response.status}`);
  }
  const buffer = await response.buffer();
  const filename = `${uri.replace(/[^0-9A-Z]+/gi, '')}.jpg`;
  await fs.writeFile(filename, buffer);
  await compressImg(filename);
  return fs.readFile(filename, { encoding: 'base64' });
}
/**
 * Compresses a JPEG file with imagemin's mozjpeg plugin at quality 5,
 * writing the result into the current directory.
 *
 * @param {string} filename - Path of the image file to compress.
 * @returns {Promise<void>} Resolves once imagemin has finished.
 */
async function compressImg(filename) {
  const mozjpeg = imageminMozjpeg({ quality: 5 });
  const options = {
    destination: '.',
    plugins: [mozjpeg],
  };
  await imagemin([filename], options);
}
/**
 * Scrapes the berlin.de English news home page and every article linked
 * from it, logging each article's title and date.
 *
 * For each article the body text and a compressed base64 copy of the main
 * image are also gathered (the image download writes a file to disk), but
 * they are not output anywhere yet.
 *
 * @returns {Promise<void>} Resolves when all articles have been visited.
 */
async function scrapeHomePage() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.berlin.de/en/news/');
    const articleHrefs = await page.evaluate(articleHrefsFromHomePage);
    // Articles are visited one at a time because they share a single page.
    for (const href of articleHrefs) {
      await page.goto(href);
      const title = await page.evaluate(articleTitle);
      const body = await page.evaluate(articleBody);
      const imgSrc = await page.evaluate(articleImg);
      const dt = await page.evaluate(articleDate);
      const base64 = await compressedBase64ImgFromURI(imgSrc);
      console.log(title);
      console.log(dt);
    }
  } finally {
    // Always shut the browser down, even when a navigation, an in-page
    // evaluation or an image download throws; otherwise the Chromium
    // process is left running.
    await browser.close();
  }
}
// Entry point: run the scraper and report any failure instead of leaving the
// returned promise without a rejection handler.
scrapeHomePage().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment