Skip to content

Instantly share code, notes, and snippets.

@wf34
Created May 12, 2018 17:16
Show Gist options
  • Save wf34/cd09f61987d8750999a3219d4e4ed936 to your computer and use it in GitHub Desktop.
Save wf34/cd09f61987d8750999a3219d4e4ed936 to your computer and use it in GitHub Desktop.
Web Scraping with JS, Node and puppeteer
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
/**
 * Start a browser via puppeteer using its default launch options.
 *
 * @returns {*} whatever `puppeteer.launch()` returns (used by callers as a promise)
 */
function get_browser() {
  const launching = puppeteer.launch();
  return launching;
}
/**
 * Open a new page in the given browser.
 *
 * @param {*} browser - object exposing a `newPage()` method
 * @returns {*} whatever `browser.newPage()` returns
 */
function instantiate_page(browser) {
  const fresh_page = browser.newPage();
  return fresh_page;
}
/**
 * Error raised when navigating a page to an address fails.
 *
 * It carries the page and the address of the failed attempt so that a handler
 * can clean up and try again.
 */
class PageLoadError extends Error {
  /**
   * @param {string|Error} message - description of the failure, or the
   *   underlying error that caused it
   * @param {*} page - the page object the navigation was attempted on
   * @param {string} address - the address that failed to load
   */
  constructor(message, page, address) {
    // When an Error is handed in, reuse its message instead of its string
    // form ("Error: ...") and keep the original around as `cause`.
    const is_wrapped = message instanceof Error;
    super(is_wrapped ? message.message : message);
    // Without this the error reports itself as a plain "Error".
    this.name = 'PageLoadError';
    if (is_wrapped) {
      this.cause = message;
    }
    this.page = page;
    this.address = address;
  }
}
/**
 * Point `page` at `address` and hand the same page back once `goto` settles.
 *
 * `goto` is called with `waitUntil: 'networkidle2'` and a `timeout` of 240000.
 *
 * @param {string} address - address to navigate to
 * @param {*} page - object exposing a promise-returning `goto(address, options)`
 * @returns {Promise<*>} resolves with `page`; rejects with a PageLoadError
 *   built from the failure, the page and the address
 */
function navigate_page(address, page) {
  const navigation_options = { waitUntil: 'networkidle2', timeout: 240000 };
  const hand_back_page = () => page;
  const wrap_failure = (failure) => {
    throw new PageLoadError(failure, page, address);
  };

  return page
    .goto(address, navigation_options)
    .then(hand_back_page)
    .catch(wrap_failure);
}
/**
 * Ask the page for its content, then pass the page itself along.
 *
 * The value produced by `page.content()` is not used; only its completion
 * matters to the returned promise.
 *
 * @param {*} page - object exposing a promise-returning `content()`
 * @returns {Promise<*>} resolves with `page` once `content()` has resolved
 */
function fetch_page_content(page) {
  const give_page_back = () => page;
  return page.content().then(give_page_back);
}
/**
 * Rejection handler that retries a failed page load from scratch.
 *
 * For a PageLoadError, the resources of the failed attempt are released and
 * `load_page` is called again with the same address. Any other error is
 * rethrown unchanged.
 *
 * NOTE(review): there is no retry cap, so an address that keeps failing is
 * retried indefinitely.
 *
 * @param {Error} error - the rejection reason
 * @returns {Promise<*>} the result of the new `load_page(error.address)` call
 * @throws {Error} `error` itself when it is not a PageLoadError
 */
function restart(error) {
  if (!(error instanceof PageLoadError)) {
    throw error;
  }

  const { page, address } = error;
  const retry = () => load_page(address);

  // Every load_page call launches its own browser, so closing only the page
  // would leave the old browser running. Close the owning browser when the
  // page exposes it; otherwise fall back to closing just the page.
  const release = () =>
    typeof page.browser === 'function' ? page.browser().close() : page.close();

  // Wait for cleanup to settle before retrying, and retry even when cleanup
  // fails: a close error must not become an unhandled rejection or mask the
  // retry.
  return Promise.resolve().then(release).then(retry, retry);
}
/**
 * Launch a browser, open a page, navigate it to `url` and wait for its content.
 *
 * Declared as a regular function rather than assigned to an undeclared name:
 * the assignment form creates an implicit global and throws a ReferenceError
 * in strict mode / ES modules.
 *
 * @param {string} url - address to load
 * @returns {Promise<*>} resolves with the page passed along by
 *   `fetch_page_content`; every rejection in the chain is routed through
 *   `restart`
 */
function load_page(url) {
  return get_browser()
    .then(instantiate_page)
    .then((page) => navigate_page(url, page))
    .then(fetch_page_content)
    .catch(restart);
}
/**
 * Read the inner HTML of the element matched by BODY_LYRICS_SELECTOR.
 *
 * @param {*} page - object exposing `$eval(selector, callback)`
 * @returns {*} whatever `page.$eval` returns (used by callers as a promise of
 *   the element's `innerHTML`)
 */
function select_html(page) {
  // NOTE: per the Puppeteer docs the callback is evaluated in the page
  // context, so it must stay free of references to outer variables.
  const read_inner_html = (node) => node.innerHTML;
  return page.$eval(BODY_LYRICS_SELECTOR, read_inner_html);
}
/**
 * Print the text of every `a` node found directly among the body contents of
 * the given HTML fragment, one `console.log` call per node.
 *
 * @param {string} lyrics_html - HTML markup to parse with cheerio
 * @returns {void}
 */
function print_some_part_of_it(lyrics_html) {
  const parse_options = { decodeEntities: false };
  const $ = cheerio.load(lyrics_html, parse_options);
  const body_nodes = $('body').contents();

  body_nodes.each((_index, node) => {
    // Only anchors are of interest; everything else is skipped.
    if (node.tagName !== 'a') {
      return;
    }
    console.log($(node).text());
  });
}
// CSS path to the lyrics paragraph of a Genius song page. It is declared
// before the promise chain that reads it (through select_html), so the script
// no longer depends on that read happening only after this line has run.
const BODY_LYRICS_SELECTOR = "body > routable-page > ng-outlet > song-page > div > div > div.song_body.column_layout > div.column_layout-column_span.column_layout-column_span--primary > div > defer-compile:nth-child(2) > lyrics > div > section > p";

// Load the song page, extract the lyrics markup and print its linked fragments.
// NOTE(review): nothing visible here closes the browser launched by load_page,
// so the process may stay alive after printing — confirm and close it if so.
load_page('https://genius.com/The-mamas-and-the-papas-california-dreamin-lyrics')
  .then(select_html)
  .then(print_some_part_of_it)
  .catch((error) => {
    // Report failures instead of leaving the rejection unhandled, and signal
    // the failure through the exit code.
    console.error(error);
    process.exitCode = 1;
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment