Skip to content

Instantly share code, notes, and snippets.

@akesterson
Forked from depau/1-README.md
Last active November 9, 2024 05:26
Show Gist options
  • Save akesterson/a1b17c18b086f63eac492a22d4448d21 to your computer and use it in GitHub Desktop.
Save akesterson/a1b17c18b086f63eac492a22d4448d21 to your computer and use it in GitHub Desktop.
VitalSource web book page scraper

VitalSource web book page scraper

This pair of userscripts (to be used with any userscript manager such as ViolentMonkey) allows scraping books from the VitalSource Bookshelf web reader.

This allows creating a PDF for offline reading with free-software readers such as Calibre.

The "inner" script hooks into the book page nested iframe; it detects when a new page image is loaded and it automatically starts a browser download.

The "outer" script hooks into the main reader page and adds a "Scrape" button which automatically goes to the next page when the inner script has successfully downloaded a page.

Usage

The script appears to work better on Chrome rather than on Firefox, likely due to Firefox's tracking protections. If you're having trouble, run it on an empty Chrome profile with no extensions.

  1. Install both userscripts
  2. Open the book
  3. Wait for the first two downloads (the web reader will prefetch one page in the background while you read)
  4. Click "Scrape this motherfucker" to start auto-advancing
  5. Keep an eye on the reader as it may occasionally present a CAPTCHA; the scraping should resume once you solve it

Additional tools

// ==UserScript==
// @name Scrape VitalSource (inner)
// @namespace http://tampermonkey.net/
// @version 2024-02-16
// @description try to take over the world!
// @author You
// @match https://jigsaw.vitalsource.com/books/*
// @icon https://www.google.com/s2/favicons?sz=64&domain=vitalsource.com
// @sandbox JavaScript
// @grant unsafeWindow
// @grant GM_download
// ==/UserScript==
/**
 * Hands a page image URL to the userscript manager's GM_download and, once the
 * download has finished, posts a "pageImage" message to the top-level window.
 *
 * Failures are only logged; no message is posted in that case.
 *
 * @param {string} url - Address of the page image to download.
 */
function download(url) {
  console.log("inner frame: downloading:", url);

  // Tells the top window which image was saved and from which frame.
  const notifyTopWindow = () => {
    const message = {
      type: "pageImage",
      frameUrl: window.location.href,
      url,
    };
    window.top.postMessage(message, "https://online.vitalsource.com/reader/books/*");
    console.log("inner frame: message sent");
  };

  const request = {
    url,
    name: "page.jpg",
    saveAs: false,
    // Every page is saved under the same name; let the manager rename on clashes.
    conflictAction: "uniquify",
    onerror: (error) => {
      console.error("inner frame: download failed:", url, error);
    },
    onload: () => {
      console.log("inner frame: downloaded:", url);
      notifyTopWindow();
    },
  };

  GM_download(request);
}
// Inner-frame entry point: poll for the page image element and download it
// once it is present, preferring a source that does not end in "/800".
(function () {
  // How many times the image source has been seen ending in "/800".
  let lowResAttempts = 0;

  function poll() {
    const pageImage = document.querySelector('img#pbk-page');

    // Image element not in the DOM yet: look again shortly.
    if (pageImage == null) {
      console.log("inner frame: image not found");
      setTimeout(poll, 100);
      return;
    }

    const url = pageImage.src;

    if (url.endsWith("/800")) {
      lowResAttempts += 1;
      // Keep waiting for a different source, but give up after 25 sightings
      // and take the "/800" image rather than polling forever.
      if (lowResAttempts < 25) {
        console.log("inner frame: low res image, retrying:", url);
        setTimeout(poll, 500);
        return;
      }
      console.log("inner frame: low res image, download anyway:", url);
    }

    console.log("inner frame: found image:", url);
    download(url);
  }

  poll();
})();
// ==UserScript==
// @name Scrape VitalSource
// @namespace http://tampermonkey.net/
// @version 2024-02-16
// @description try to take over the world!
// @author You
// @match https://online.vitalsource.com/reader/books/*
// @icon https://www.google.com/s2/favicons?sz=64&domain=vitalsource.com
// @grant window.onurlchange
// @grant unsafeWindow
// ==/UserScript==
/**
 * Resolves the element referenced by the first <label> whose trimmed text
 * equals the given string.
 *
 * Only the first label with matching text is considered: if it has no usable
 * "for" attribute, later labels with the same text are not examined.
 *
 * @param {string} labelText - Exact (trimmed) label text to look for.
 * @returns {Element|null} The element whose id matches the label's "for"
 *   attribute, or null when there is no matching label, no "for" attribute,
 *   or no element with that id.
 */
function findElementByLabelText(labelText) {
  const firstMatch = Array.from(document.querySelectorAll('label')).find(
    (candidate) => candidate.textContent.trim() === labelText
  );
  if (firstMatch === undefined) {
    return null;
  }

  const targetId = firstMatch.getAttribute('for');
  return targetId ? document.getElementById(targetId) : null;
}
// Outer-page entry point: injects a start/stop link into the reader and, while
// running, clicks "Next" each time a "pageImage" message is received.
(function () {
  // Payload of the most recent "pageImage" message, or null if none arrived yet.
  let lastUrl = null;
  // Called after every "pageImage" message while scraping; null when idle.
  let lastUrlChangedCallback = null;
  // Page number read on the first worker run; -1 means "not started".
  let currentPage = -1;
  let running = false;

  /** Resets all scraping state and restores the idle button label. */
  function stop() {
    running = false;
    lastUrlChangedCallback = null;
    currentPage = -1;
    const button = document.querySelector("#scrapeButton");
    // Guard so a missing button cannot turn a stop into an exception.
    if (button != null) {
      button.textContent = "Scrape this motherfucker";
    }
  }

  window.addEventListener("message", (e) => {
    // Any frame can post here, including null/primitive payloads; only
    // objects tagged as "pageImage" are of interest.
    if (e.data == null || e.data.type !== "pageImage") return;
    console.log("Page image event:", e);
    // Exposed on the page window, presumably for debugging from the console.
    unsafeWindow.msg = e;
    lastUrl = e.data;
    if (lastUrlChangedCallback) {
      lastUrlChangedCallback();
    }
  });

  /**
   * One scraping step: validates the current state and schedules a click on
   * the reader's "Next" button. Stops the scraper when it cannot continue.
   */
  const worker = function () {
    const goToPageInput = findElementByLabelText('Go to Page');
    if (goToPageInput == null) {
      console.log("Go to page input not found!");
      return;
    }

    const pageNum = parseInt(goToPageInput.value, 10);
    // NOTE(review): currentPage is only assigned on the first run below, so
    // this check compares against the starting page only — confirm whether it
    // was meant to track every advance before changing it.
    if (pageNum === currentPage) {
      console.log("Skipping duplicate event");
      return;
    }

    if (currentPage === -1) {
      currentPage = pageNum;
      if (currentPage !== 1) {
        // Ask permission to start from a page other than the first
        if (!confirm("Do you want to start from page " + currentPage + "?")) {
          console.log("User cancelled");
          stop();
          return;
        }
      }
    }

    if (lastUrl == null) {
      console.log("No URL found");
      stop();
      return;
    }

    console.log("SCRAPER:", lastUrl.url);
    setTimeout(() => {
      const nextButton = document.querySelector('[aria-label="Next"]');
      if (nextButton == null) {
        console.log("Next button not found!");
        stop();
        return;
      }
      // A disabled "Next" button ends the run.
      if (nextButton.hasAttribute("disabled")) {
        console.log("Next button is disabled");
        stop();
        return;
      }
      nextButton.click();
    }, 10);
  };

  /** Click handler for the injected link: toggles scraping on and off. */
  const doStuff = function () {
    if (running) {
      console.log("Stopping scraper");
      stop();
      return;
    }
    console.log("Starting scraper, lastUrl:", lastUrl);
    const button = document.querySelector("#scrapeButton");
    running = true;
    lastUrlChangedCallback = worker;
    if (button != null) {
      button.textContent = "Stop scraping";
    }
    // Kick off the first step immediately instead of waiting for a message.
    worker();
  };

  /**
   * Adds the start/stop link right after the reader's search control,
   * retrying once per second until that control exists.
   */
  const inject = function () {
    const anchorPoint = document.querySelector('[aria-label="Search across book"]');
    if (anchorPoint == null) {
      console.log("Button not found, retrying in 1s");
      setTimeout(inject, 1000);
      return;
    }
    const link = document.createElement("a");
    link.id = "scrapeButton";
    link.href = "#";
    link.textContent = "Scrape this motherfucker";
    link.addEventListener("click", (event) => {
      // href="#" would otherwise navigate to the "#" fragment on every click.
      event.preventDefault();
      doStuff();
    });
    anchorPoint.after(link);
    console.log("Injected!");
  };

  inject();
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment