Skip to content

Instantly share code, notes, and snippets.

@pfeilbr
Last active September 11, 2018 17:12
Show Gist options
  • Save pfeilbr/420eb1765e0e51e82d02c6ce41993a45 to your computer and use it in GitHub Desktop.
Save pfeilbr/420eb1765e0e51e82d02c6ce41993a45 to your computer and use it in GitHub Desktop.
convert a safari book (safaribooksonline.com) to text in the browser
/* Convert a Safari book (safaribooksonline.com) to plain text in the browser. */
/* You must be logged in and on one of the pages of the book; the text is logged to the console. */
(async () => {
/**
 * Converts a Safari Books Online book to plain text, section by section.
 *
 * Relies on the browser globals `document`, `fetch` and `DOMParser`, and on
 * the page exposing a table of contents (TOC) under the selectors returned by
 * `tocSelector()` and `tocLinkElementsSelector()`.
 */
class Scraper {
  /**
   * Wait for the given amount of time.
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>} Resolves once the delay has elapsed.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * @returns {string} CSS selector of the element that toggles the TOC.
   */
  tocSelector() {
    return '.sbo-toc-thumb';
  }

  /**
   * @returns {string} CSS selector matching the links inside the TOC.
   */
  tocLinkElementsSelector() {
    return '.tocList a';
  }

  /**
   * @returns {boolean} True when at least one TOC link is present in the page.
   */
  isTableOfContentsLoaded() {
    return document.querySelectorAll(this.tocLinkElementsSelector()).length > 0;
  }

  /**
   * Load the TOC. Its HTML is fetched on demand (ajax) when the user clicks
   * the toggle, so the toggle is clicked explicitly and the page is polled
   * until TOC links appear.
   *
   * @param {number} [timeoutMs=30000] - Maximum time to wait for the TOC.
   * @param {number} [pollIntervalMs=250] - Delay between two checks.
   * @returns {Promise<void>}
   * @throws {Error} If the toggle element is missing or the TOC does not
   *   show up before the timeout.
   */
  async loadTableOfContents(timeoutMs = 30000, pollIntervalMs = 250) {
    const tocSelector = this.tocSelector();
    const tocElement = document.querySelector(tocSelector);
    if (!tocElement) {
      throw new Error(`Table of contents toggle not found (selector: ${tocSelector})`);
    }

    tocElement.click();
    // Bound the wait so a TOC that never loads cannot spin forever.
    const deadline = Date.now() + timeoutMs;
    while (!this.isTableOfContentsLoaded()) {
      if (Date.now() >= deadline) {
        throw new Error(`Table of contents did not load within ${timeoutMs} ms`);
      }
      await this.sleep(pollIntervalMs);
    }
    tocElement.click(); // hide/toggle the TOC again when done
  }

  /**
   * Get the sections (chapters, parts, ...) of the book as defined by the TOC.
   *
   * @returns {Array<{title: string, url: string}>} One entry per TOC link, in
   *   document order. The title is the trimmed text of the link's first child
   *   node, or an empty string when the link has no child node.
   */
  getSections() {
    const linkElements = document.querySelectorAll(this.tocLinkElementsSelector());
    return Array.from(linkElements, (linkElement) => {
      const firstChild = linkElement.firstChild;
      return {
        title: firstChild ? firstChild.textContent.trim() : '',
        url: linkElement.href,
      };
    });
  }

  /**
   * Fetch a URL and return the response body as text.
   *
   * @param {string} url - URL to fetch.
   * @returns {Promise<string>} The response body.
   * @throws {Error} If the response status is not in the 2xx range.
   */
  async fetchURLContent(url) {
    const resp = await fetch(url);
    if (!resp.ok) {
      // Without this check an error page would be parsed as book content.
      throw new Error(`Failed to fetch ${url}: HTTP ${resp.status}`);
    }
    return resp.text();
  }

  /**
   * Fetch every section listed in the TOC (except the last entry) and
   * concatenate the text found in each section's `#sbo-rt-content` element.
   *
   * @returns {Promise<string>} The concatenated text of the fetched sections.
   * @throws {Error} If the TOC cannot be loaded, a section cannot be fetched,
   *   or a fetched page has no `#sbo-rt-content` element.
   */
  async convertBookToText() {
    if (!this.isTableOfContentsLoaded()) {
      await this.loadTableOfContents();
    }

    const sections = this.getSections();
    const parser = new DOMParser();
    const sectionTexts = [];

    // NOTE(review): the last TOC entry is skipped, as in the original code;
    // presumably it does not point to book content. Confirm before changing.
    // Sections are fetched one at a time, in TOC order.
    for (const section of sections.slice(0, -1)) {
      const html = await this.fetchURLContent(section.url);
      const doc = parser.parseFromString(html, 'text/html');
      const contentElement = doc.querySelector('#sbo-rt-content');
      if (!contentElement) {
        throw new Error(
          `No #sbo-rt-content element found in section "${section.title}" (${section.url})`
        );
      }
      sectionTexts.push(contentElement.textContent);
    }

    return sectionTexts.join('');
  }

  /**
   * Convert the whole book to text and print it with `console.log`.
   * @returns {Promise<void>}
   */
  async bookToText() {
    const text = await this.convertBookToText();
    console.log(text);
  }
}
// Entry point: convert the book and print it. Failures are reported here
// because nothing awaits the promise returned by the enclosing async function.
const scraper = new Scraper();
try {
  await scraper.bookToText();
} catch (err) {
  console.error('Failed to convert book to text:', err);
}
})()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment