-
-
Save MTco/a3d42a5160a81b120d451a4bc680508e to your computer and use it in GitHub Desktop.
Kindle Cloud Reader scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// modified based on:
// - https://lowrey.me/scraping-a-book-from-kindle-read-amazon-com/
// Start from an empty console so the scraped output is easy to find.
console.clear();
(function() { | |
var hashes = {}; | |
var all = ""; | |
/**
 * Compute a 32-bit polynomial rolling hash of a string
 * (hash = hash * 31 + charCode, wrapped to a signed 32-bit integer).
 *
 * The previous formula raised each character code to a power of the string
 * length; for page-sized strings that overflowed to Infinity (or lost all
 * low-order bits), so the 32-bit truncation collapsed most of the input to 0
 * and only the last few characters influenced the result.
 *
 * @param {string} str - Text to hash.
 * @returns {number} Signed 32-bit integer hash; 0 for the empty string.
 */
function hashString(str) {
  let hash = 0;
  for (let i = 0; i < str.length; i++) {
    // Math.imul multiplies in 32-bit integer space, so no precision is lost
    // before the `| 0` wrap-around.
    hash = (Math.imul(hash, 31) + str.charCodeAt(i)) | 0;
  }
  return hash;
}
/**
 * Return the document of the reader application iframe
 * (the element with id "KindleReaderIFrame").
 *
 * @returns {Document} The iframe's contentDocument.
 * @throws {Error} If the iframe element is not present in the page.
 */
function getKindleBookAppFrame() {
  const readerFrame = document.querySelector("#KindleReaderIFrame");
  if (readerFrame === null) {
    // Fail with an explicit message instead of an opaque null-dereference.
    throw new Error("Reader iframe #KindleReaderIFrame was not found in this page.");
  }
  return readerFrame.contentDocument;
}
/**
 * Click the right-hand page-turn area of the reader, then wait 200 ms
 * before resolving so the caller does not read the page immediately.
 *
 * @returns {Promise<void>} Resolves 200 ms after the click; rejects if the
 *   reader frame or the page-turn element cannot be found.
 */
function turnPage() {
  return new Promise((resolve) => {
    const appFrame = getKindleBookAppFrame();
    const nextPageArea = appFrame.getElementById("kindleReader_pageTurnAreaRight");
    if (nextPageArea === null) {
      // Thrown inside the executor, so it surfaces as a rejection.
      throw new Error("Page-turn element #kindleReader_pageTurnAreaRight was not found.");
    }
    nextPageArea.click();
    setTimeout(resolve, 200);
  });
}
/**
 * Tell whether a block is a heading: either the block itself is an
 * h1-h4 element, or one of its direct children is.
 *
 * The previous version tried to inspect children inside a jQuery `.each`
 * arrow callback; `this` was not the iterated element there, and `return true`
 * only returned from the callback, so child headings were never reported.
 *
 * @param {*} block - Anything accepted by `$()` (element, HTML string, ...).
 * @returns {boolean} True when the block is, or directly contains, h1-h4.
 */
function isHeading(block) {
  const headingSelector = "h1, h2, h3, h4";
  const $block = $(block);
  return $block.is(headingSelector) || $block.children(headingSelector).length > 0;
}
/**
 * Collect the top-level content elements (div, h1-h6, ol, ul, li directly
 * under body) of one content frame document.
 *
 * The original read an undeclared `currentContentFrameIndex`; the index is
 * now an optional parameter that falls back to that global when it exists,
 * and to 0 otherwise.
 *
 * @param {Array<Document|null>} contentFrames - Candidate frame documents.
 * @param {number} [frameIndex] - Which entry of `contentFrames` to query.
 * @returns {Element[]} Matching elements, or an empty array when the selected
 *   frame document is missing.
 */
function getContentFramesSubElements(
  contentFrames,
  frameIndex = typeof currentContentFrameIndex !== "undefined"
    ? currentContentFrameIndex
    : 0
) {
  const blockSelector = [
    "body > div",
    "body > h1",
    "body > h2",
    "body > h3",
    "body > h4",
    "body > h5",
    "body > h6",
    "body > ol",
    "body > ul",
    "body > li"
  ].join(", ");
  const frameDocument = contentFrames[frameIndex];
  if (!frameDocument) {
    return [];
  }
  return Array.from(frameDocument.querySelectorAll(blockSelector));
}
/**
 * Capture the HTML of the page body currently loaded in the nested iframes,
 * unless a body with the same text hash was already captured.
 *
 * Reads and updates the shared `hashes` map (text hash -> true) so repeated
 * calls on an unchanged page yield nothing.
 *
 * @returns {Promise<string[]>} Resolves with an array holding the body's
 *   innerHTML, or an empty array when the body is a duplicate or not found.
 */
function scrapeFrames() {
  return new Promise((resolve) => {
    // Local array: the original assigned an undeclared `frames`, which
    // clobbered the global of the same name.
    const frames = [];
    // Takes the second <body> found inside iframes nested within iframes;
    // NOTE(review): presumably that is the frame holding the visible page — confirm.
    const frameBody = $("iframe")
      .contents()
      .find("iframe")
      .contents()
      .find("body")
      .get(1);
    if (!frameBody) {
      // Nothing rendered yet (or layout changed): report no new content
      // rather than throwing on a missing element.
      resolve(frames);
      return;
    }
    const hash = hashString(frameBody.innerText);
    if (hashes[hash] === undefined) {
      hashes[hash] = true;
      frames.push(frameBody.innerHTML);
    }
    resolve(frames);
  });
}
/**
 * Convert scraped frames into plain records of the form
 * `{ is_heading, is_list_item, text }`.
 *
 * - A frame that is a `ul` or `ol` yields one record per non-blank child,
 *   each flagged `is_list_item`.
 * - Any other frame with non-blank text yields one record, flagged
 *   `is_heading` according to `isHeading`.
 * - Frames whose text is blank are skipped.
 *
 * Each record is a fresh object; the original pushed one shared object for
 * every list item, so all items ended up with the last item's text.
 *
 * @param {Array<*>} frames - Values accepted by `$()` (elements or HTML strings).
 * @returns {Promise<Array<{is_heading: boolean, is_list_item: boolean, text: string}>>}
 */
function formatFrames(frames) {
  console.log("unformatted");
  console.log(frames);
  return new Promise((resolve) => {
    const formattedFrames = [];
    for (const frame of frames) {
      const $frame = $(frame);
      if ($frame.is("ul, ol")) {
        // Break the list up into its individual non-blank entries.
        Array.from($frame.contents())
          .map((el) => $(el).text())
          .filter((text) => text.trim().length !== 0)
          .forEach((text) => {
            formattedFrames.push({
              is_heading: false,
              is_list_item: true,
              text: text
            });
          });
        continue;
      }
      // Otherwise treat the frame as a paragraph or heading.
      const text = $frame.text();
      if (text.trim().length === 0) {
        continue;
      }
      formattedFrames.push({
        is_heading: isHeading(frame),
        is_list_item: false,
        text: text
      });
    }
    resolve(formattedFrames);
  });
}
/**
 * Scrape the current page and format the result.
 *
 * @returns {Promise<Array>} Resolves with whatever `formatFrames` produces
 *   for the output of `scrapeFrames`.
 */
function getFormattedFrames() {
  return scrapeFrames().then(formatFrames);
}
/**
 * Check whether the element with id "kindle_sample_end_message" is
 * currently visible in the reader frame.
 *
 * @returns {boolean} Result of jQuery's `:visible` test on that element.
 */
function hasReachedEndSample() {
  const appFrame = getKindleBookAppFrame();
  const endSampleMessage = appFrame.getElementById("kindle_sample_end_message");
  return $(endSampleMessage).is(":visible");
}
/**
 * Check whether the reader footer text contains "100%".
 *
 * @returns {boolean} True when the footer's text includes "100%"; false when
 *   it does not or when the footer element is absent.
 */
function hasReachedEnd() {
  const footer = getKindleBookAppFrame().getElementById("kindleReader_footer");
  if (footer === null) {
    // A missing footer used to throw inside the scrape promise chain.
    return false;
  }
  return footer.innerText.includes("100%");
}
/**
 * Print everything accumulated in the shared `all` string to the console.
 */
function done() {
  console.log(all);
}
/**
 * Scrape loop: after an 800 ms delay, capture the current page into the
 * shared `all` string, turn the page, and either finish (end of sample or
 * end of book) or schedule itself again.
 *
 * If any step fails, the error is logged and `done()` is still called so the
 * text collected so far is not lost; previously the rejection was unhandled
 * and the loop simply stopped.
 */
function scrape() {
  setTimeout(() => {
    scrapeFrames()
      .then((frames) => {
        console.log(".");
        all += frames.join("\n");
      })
      .then(turnPage)
      .then(() => {
        if (hasReachedEndSample() || hasReachedEnd()) {
          done();
        } else {
          scrape();
        }
      })
      .catch((err) => {
        console.error("Scraping stopped after an error:", err);
        done();
      });
  }, 800);
}
scrape(); | |
})(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
My guess would be no. Granted, I only spent about five minutes looking today, but I searched the page source for words from the text I was reading, and none of that text seemed to be in the rendered page at all. The AJAX responses on turning the page seemed to contain long and detailed instructions on how to draw the text.