Last active
September 11, 2018 17:12
-
-
Save pfeilbr/420eb1765e0e51e82d02c6ce41993a45 to your computer and use it in GitHub Desktop.
convert a safari book (safaribooksonline.com) to text in the browser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* convert a safari book (safaribooksonline.com) to text in the browser */
/* must be logged in and on one of the pages in the book */
(async () => {
/**
 * Collects the text of a book by walking the links of the page's table of
 * contents (TOC), fetching each linked page and extracting its content element.
 *
 * Uses the browser globals `document`, `fetch` and `DOMParser`, so it has to run
 * inside a browser tab that is showing one of the book's pages.
 */
class Scraper {
  /**
   * Resolve after a delay.
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** @returns {string} CSS selector of the element that is clicked to toggle the TOC. */
  tocSelector() {
    return '.sbo-toc-thumb';
  }

  /** @returns {string} CSS selector matching the link elements inside the TOC. */
  tocLinkElementsSelector() {
    return '.tocList a';
  }

  /** @returns {boolean} True when the document contains at least one TOC link. */
  isTableOfContentsLoaded() {
    return document.querySelectorAll(this.tocLinkElementsSelector()).length > 0;
  }

  /**
   * The TOC HTML is loaded on demand (ajax) when the user clicks the TOC toggle,
   * so click it explicitly, wait until the TOC links show up, then click again
   * to hide the TOC.
   *
   * @param {number} [timeoutMs=30000] - Maximum time to wait for the TOC links.
   * @returns {Promise<void>}
   * @throws {Error} If the toggle element is missing or the links do not appear
   *   within `timeoutMs`.
   */
  async loadTableOfContents(timeoutMs = 30000) {
    const selector = this.tocSelector();
    const tocElement = document.querySelector(selector);
    if (!tocElement) {
      throw new Error(`TOC toggle element not found for selector "${selector}"`);
    }

    tocElement.click();

    // Poll for the links, but give up eventually instead of spinning forever
    // when the page layout does not match the selectors.
    const deadline = Date.now() + timeoutMs;
    while (!this.isTableOfContentsLoaded()) {
      if (Date.now() >= deadline) {
        throw new Error(`Timed out after ${timeoutMs}ms waiting for the table of contents`);
      }
      await this.sleep(250);
    }

    tocElement.click(); // hide/toggle TOC when done
  }

  /**
   * Get the sections (chapters, parts, ...) as defined by the TOC links.
   *
   * @returns {{title: string, url: string}[]} One entry per TOC link, in
   *   document order. The title is the trimmed text of the link's first child
   *   node, or '' when the link has no children.
   */
  getSections() {
    const linkElements = document.querySelectorAll(this.tocLinkElementsSelector());
    return Array.from(linkElements, (linkElement) => {
      const firstChild = linkElement.firstChild;
      return {
        title: firstChild ? firstChild.textContent.trim() : '',
        url: linkElement.href,
      };
    });
  }

  /**
   * Fetch a URL and return the response body as text.
   *
   * @param {string} url - Address to fetch.
   * @returns {Promise<string>} The response body.
   * @throws {Error} If the response status is not in the 2xx range.
   */
  async fetchURLContent(url) {
    const resp = await fetch(url);
    if (!resp.ok) {
      // Without this check an error page would be parsed as if it were book content.
      throw new Error(`Failed to fetch ${url}: ${resp.status} ${resp.statusText}`);
    }
    return resp.text();
  }

  /**
   * Fetch the sections one after another and concatenate the text of each
   * page's `#sbo-rt-content` element.
   *
   * @returns {Promise<string>} The concatenated text, with no separator added
   *   between sections.
   * @throws {Error} If a page cannot be fetched or has no `#sbo-rt-content` element.
   */
  async convertBookToText() {
    if (!this.isTableOfContentsLoaded()) {
      await this.loadTableOfContents();
    }

    const sections = this.getSections();
    const parser = new DOMParser();
    const parts = [];

    // NOTE(review): the last TOC entry is skipped, exactly as in the original
    // code; the reason is not documented - confirm this is intentional.
    for (const section of sections.slice(0, -1)) {
      const html = await this.fetchURLContent(section.url);
      const doc = parser.parseFromString(html, 'text/html');
      const contentElement = doc.querySelector('#sbo-rt-content');
      if (!contentElement) {
        throw new Error(
          `No #sbo-rt-content element found for section "${section.title}" (${section.url})`
        );
      }
      parts.push(contentElement.textContent);
    }

    return parts.join('');
  }

  /**
   * Convert the book to text and write the result to the console.
   * @returns {Promise<void>}
   */
  async bookToText() {
    const text = await this.convertBookToText();
    console.log(text);
  }
}
// Entry point: create a scraper and run the book-to-text conversion.
const scraper = new Scraper();
await scraper.bookToText();
})()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment