Skip to content

Instantly share code, notes, and snippets.

@jargnar
Created January 30, 2021 17:51
Show Gist options
  • Save jargnar/987f726ba879d53b83895698326e830f to your computer and use it in GitHub Desktop.
Save jargnar/987f726ba879d53b83895698326e830f to your computer and use it in GitHub Desktop.
// MIT
// Suhas Guruprasad, [email protected]
const puppeteer = require("puppeteer");
const fetch = require("node-fetch");
const imagemin = require("imagemin");
const imageminMozjpeg = require("imagemin-mozjpeg");
const fs = require("fs").promises;
/**
 * Collects the unique article links present on the news home page.
 *
 * Reads the global `document`, so it has to execute inside a page context
 * (the scraper hands it to `page.evaluate`).
 *
 * @returns {string[]} De-duplicated `href` values of anchors nested in
 *   `<article>` elements that start with the news URL prefix and end with
 *   "html", in first-seen order.
 */
function articleHrefsFromHomePage() {
  const prefix = 'https://www.berlin.de/en/news/';
  const suffix = 'html';
  const anchors = Array.from(document.querySelectorAll('article a'));
  // A Set keeps insertion order while dropping repeated links.
  const unique = new Set();
  for (const { href } of anchors) {
    if (href.startsWith(prefix) && href.endsWith(suffix)) {
      unique.add(href);
    }
  }
  return Array.from(unique);
}
/**
 * Reads the headline of the article currently loaded in the page.
 *
 * Relies on the global `document`, so it must run in a page context.
 * Throws a TypeError when the page contains no `<h1>`.
 *
 * @returns {string} The `innerText` of the first `<h1>` element.
 */
function articleTitle() {
  const heading = document.querySelector('h1');
  return heading.innerText;
}
/**
 * Builds the article text from every `div.block.paragraph` element on the page.
 *
 * Relies on the global `document`, so it must run in a page context.
 * Within each paragraph, line breaks are removed and every "+" is replaced
 * by a space; the cleaned paragraphs are then joined with single spaces.
 *
 * @returns {string} The concatenated article body ('' when no paragraph matches).
 */
function articleBody() {
  const paragraphs = Array.from(document.querySelectorAll('div.block.paragraph'));
  return paragraphs
    .map((paragraph) =>
      paragraph.innerText
        .replace(/\r?\n|\r/g, '')
        // A string pattern would only replace the first "+", so use a global regex.
        .replace(/\+/g, ' '))
    .join(' ');
}
/**
 * Returns the absolute URL of the first image inside `.main-content`.
 *
 * Relies on the global `document`, so it must run in a page context.
 * The `src` attribute is resolved against the site origin, so root-relative
 * paths gain the origin while already-absolute or protocol-relative URLs are
 * kept intact instead of being glued onto the origin.
 * Throws a TypeError when no matching image exists.
 *
 * @returns {string} Absolute image URL.
 */
function articleImg() {
  const src = document.querySelector('.main-content img').getAttribute('src');
  return new URL(src, 'https://www.berlin.de').href;
}
/**
 * Extracts the date portion from the `p.source.date` element of the page.
 *
 * Relies on the global `document`, so it must run in a page context.
 * Takes the text between the first and second ":" of the element's
 * `innerText`, removes the first "Uhr", trims surrounding whitespace and
 * finally drops the last two characters.
 * NOTE(review): the two dropped characters are presumably the hour digits
 * that precede the time's colon — confirm against the live markup.
 *
 * @returns {string} The extracted date text.
 */
function articleDate() {
  const rawText = document.querySelector('p.source.date').innerText;
  const [, afterFirstColon] = rawText.split(':');
  const withoutUhr = afterFirstColon.replace('Uhr', '').trim();
  return withoutUhr.slice(0, -2);
}
/**
 * Downloads an image, compresses it on disk and returns it base64-encoded.
 *
 * The image is written to a file whose name is the URI stripped of every
 * non-alphanumeric character plus ".jpg", handed to `compressImg`, and then
 * read back. The file is left on disk afterwards.
 *
 * @param {string} uri - URL of the image to download.
 * @returns {Promise<string>} Base64 content of the file after compression.
 * @throws {Error} When the HTTP response status is not successful.
 */
async function compressedBase64ImgFromURI(uri) {
  const response = await fetch(uri);
  // Without this check an error page (404/500 body) would be saved and
  // passed to the JPEG compressor as if it were an image.
  if (!response.ok) {
    throw new Error(`Failed to download image ${uri}: ${response.status} ${response.statusText}`);
  }
  const buffer = await response.buffer();
  const filename = `${uri.replace(/[^0-9A-Z]+/gi, '')}.jpg`;
  await fs.writeFile(filename, buffer);
  await compressImg(filename);
  return fs.readFile(filename, { encoding: 'base64' });
}
/**
 * Compresses a JPEG file with imagemin's mozjpeg plugin.
 *
 * The output destination is the current directory ('.').
 * NOTE(review): for a file that already lives in the current directory this
 * presumably overwrites it in place — confirm against the imagemin docs.
 *
 * @param {string} filename - Path of the image to compress.
 * @param {number} [quality=5] - mozjpeg quality setting; the default keeps
 *   the previous hard-coded value.
 * @returns {Promise<void>} Resolves once imagemin has finished.
 */
async function compressImg(filename, quality = 5) {
  await imagemin([filename], {
    destination: '.',
    plugins: [imageminMozjpeg({ quality })],
  });
}
/**
 * Scrapes the berlin.de English news home page and every article linked on it.
 *
 * Launches a browser, collects the article links from the home page, then
 * visits each article in turn, extracting its title, body, image URL and
 * date, and downloading/compressing the image. The title and date of each
 * article are printed to the console.
 *
 * @returns {Promise<void>} Resolves once every article has been processed.
 * @throws Propagates any navigation, extraction or download error; the
 *   browser is closed before the error reaches the caller.
 */
async function scrapeHomePage() {
  const browser = await puppeteer.launch();
  // try/finally guarantees the browser process is shut down even when a
  // navigation or extraction step throws part-way through.
  try {
    const page = await browser.newPage();
    await page.goto('https://www.berlin.de/en/news/');
    const articleHrefs = await page.evaluate(articleHrefsFromHomePage);
    // Articles are visited one after another because they share a single page.
    for (const href of articleHrefs) {
      await page.goto(href);
      const title = await page.evaluate(articleTitle);
      // `body` and `base64` are extracted but not consumed here yet; the
      // image call still writes the compressed file to disk.
      const body = await page.evaluate(articleBody);
      const imgSrc = await page.evaluate(articleImg);
      const dt = await page.evaluate(articleDate);
      const base64 = await compressedBase64ImgFromURI(imgSrc);
      console.log(title);
      console.log(dt);
    }
  } finally {
    await browser.close();
  }
}
// Entry point: run the scraper and report any failure instead of leaving the
// returned promise without a rejection handler.
scrapeHomePage().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment