Created
May 12, 2018 17:16
-
-
Save wf34/cd09f61987d8750999a3219d4e4ed936 to your computer and use it in GitHub Desktop.
Web Scraping with JS, Node and puppeteer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const cheerio = require('cheerio'); | |
/**
 * Start a browser through Puppeteer using its default launch options
 * (no arguments are passed to `puppeteer.launch`).
 *
 * @returns {*} Whatever `puppeteer.launch()` returns; the rest of this file
 *   chains `.then` on it.
 */
function get_browser() {
  const launching = puppeteer.launch();
  return launching;
}
/**
 * Open a new page (tab) in the given browser.
 *
 * @param {object} browser - Object exposing a `newPage()` method.
 * @returns {*} Whatever `browser.newPage()` returns.
 */
function instantiate_page(browser) {
  const openedPage = browser.newPage();
  return openedPage;
}
/**
 * Error raised when navigating a page to an address fails.
 *
 * Carries the page and the address involved so that a handler can clean up
 * and retry the same address.
 */
class PageLoadError extends Error {
  /**
   * @param {string|Error} message - Description of the failure, or the
   *   underlying error itself. An Error is stringified for `message` (the
   *   standard `Error` constructor behaviour) and also kept as `cause`.
   * @param {object} page - The page whose navigation failed.
   * @param {string} address - The address that was being loaded.
   */
  constructor(message, page, address) {
    super(message);
    // Without this, stack traces and toString() report a generic "Error".
    this.name = 'PageLoadError';
    // Keep the original error (and its stack) instead of only its string form.
    if (message instanceof Error) {
      this.cause = message;
    }
    this.page = page;
    this.address = address;
  }
}
/**
 * Navigate `page` to `address` and hand the same page back once loading
 * has finished.
 *
 * Navigation waits for the 'networkidle2' condition with a 240000 ms timeout.
 *
 * @param {string} address - Address to load.
 * @param {object} page - Object exposing `goto(address, options)`.
 * @returns {Promise<object>} Resolves with `page`; rejects with a
 *   PageLoadError (built from the original error, the page and the address)
 *   when `goto` rejects.
 */
function navigate_page(address, page) {
  const options = { waitUntil: 'networkidle2', timeout: 240000 };
  const onLoaded = () => page;
  const onFailed = (reason) => {
    throw new PageLoadError(reason, page, address);
  };
  return page.goto(address, options).then(onLoaded).catch(onFailed);
}
/**
 * Wait for `page.content()` to settle, then pass the page itself along.
 *
 * The content returned by `page.content()` is discarded; the resolved value
 * is the page, which is what the next step in this file (`select_html`)
 * operates on.
 *
 * @param {object} page - Object exposing `content()` returning a promise.
 * @returns {Promise<object>} Resolves with the same `page`.
 */
function fetch_page_content(page) {
  const handBackPage = () => page;
  return page.content().then(handBackPage);
}
/**
 * Rejection handler for `load_page`: retry the load after a PageLoadError,
 * rethrow anything else.
 *
 * Before retrying, the resources of the failed attempt are released and the
 * release is waited for. Each `load_page` call launches its own browser, so
 * the whole browser owning the failed page is closed when the page exposes
 * `browser()`; otherwise only the page is closed. Cleanup is best-effort: a
 * failure to close is logged and does not prevent the retry.
 *
 * NOTE(review): retries are unbounded — a permanently failing address will
 * loop forever. Consider adding a retry budget.
 *
 * @param {Error} error - The rejection reason.
 * @returns {Promise<*>} Result of `load_page(error.address)`.
 * @throws {Error} `error` itself when it is not a PageLoadError.
 */
function restart(error) {
  if (!(error instanceof PageLoadError)) {
    throw error;
  }

  const failedPage = error.page;
  const releaseResources = () => {
    const owner =
      typeof failedPage.browser === 'function' ? failedPage.browser() : failedPage;
    return owner.close();
  };

  return Promise.resolve()
    .then(releaseResources)
    .catch((closeError) => {
      // Best-effort cleanup: report, but still retry the load.
      console.warn('restart: cleanup before retry failed:', closeError);
    })
    .then(() => load_page(error.address));
}
/**
 * Launch a browser, open a page, navigate it to `url` and wait for its
 * content, yielding the loaded page.
 *
 * Any rejection in the chain is passed to `restart`, which retries the load
 * for a PageLoadError and rethrows every other error.
 *
 * Declared as a function rather than assigned to an undeclared identifier,
 * which would create an implicit global and throw in strict mode / ES modules.
 *
 * @param {string} url - Address to load.
 * @returns {Promise<object>} Resolves with the loaded page (or with whatever
 *   `restart` resolves to after a retry).
 */
function load_page(url) {
  return get_browser()
    .then(instantiate_page)
    .then((page) => navigate_page(url, page))
    .then(fetch_page_content)
    .catch(restart);
}
/**
 * Read the inner HTML of the element matching BODY_LYRICS_SELECTOR.
 *
 * @param {object} page - Object exposing `$eval(selector, fn)`.
 * @returns {*} Whatever `page.$eval` returns for the selector and an
 *   extractor that reads `innerHTML` from the matched element.
 */
function select_html(page) {
  const readInnerHtml = (node) => node.innerHTML;
  return page.$eval(BODY_LYRICS_SELECTOR, readInnerHtml);
}
/**
 * Print the text of every `<a>` node found among the direct contents of the
 * parsed document's body, one `console.log` call per anchor.
 *
 * The HTML is parsed with cheerio with `decodeEntities` disabled.
 *
 * @param {string} lyrics_html - HTML fragment to parse.
 * @returns {void}
 */
function print_some_part_of_it(lyrics_html) {
  const $ = cheerio.load(lyrics_html, { decodeEntities: false });

  const printIfAnchor = (index, node) => {
    if (node.tagName !== 'a') {
      return;
    }
    console.log($(node).text());
  };

  $('body').contents().each(printIfAnchor);
}
// Selector that select_html passes to page.$eval to locate the element whose
// inner HTML is extracted.
const BODY_LYRICS_SELECTOR = "body > routable-page > ng-outlet > song-page > div > div > div.song_body.column_layout > div.column_layout-column_span.column_layout-column_span--primary > div > defer-compile:nth-child(2) > lyrics > div > section > p";

// Script entry point: load the song page, extract the selected HTML and print
// the anchors found in it.
//
// The browser behind the page is closed whether or not extraction succeeds;
// otherwise it stays alive and keeps the Node process from exiting. Failures
// are reported and reflected in the exit code instead of surfacing as an
// unhandled rejection.
load_page('https://genius.com/The-mamas-and-the-papas-california-dreamin-lyrics')
  .then((page) => {
    const shutDown = () => page.browser().close();
    const rethrow = (err) => () => {
      throw err;
    };
    return select_html(page)
      .then(print_some_part_of_it)
      .then(
        shutDown,
        // Close the browser, then surface the original failure even if
        // closing itself fails.
        (err) => shutDown().then(rethrow(err), rethrow(err)),
      );
  })
  .catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment