Skip to content

Instantly share code, notes, and snippets.

@phunanon
Created June 14, 2024 03:23
Show Gist options
  • Select an option

  • Save phunanon/8645ecb2ab81eee4825d98fb4fd5187f to your computer and use it in GitHub Desktop.

Select an option

Save phunanon/8645ecb2ab81eee4825d98fb4fd5187f to your computer and use it in GitHub Desktop.
Because I'm sick of having to watch Mikha do it by hand! Uses ball-drop technique.
// ==UserScript==
// @name PDF column extractor
// @description extracts PDF pages for Mikha
// @namespace http://tampermonkey.net/
// @version 2024-06-13
// @author Patrick Bowen
// @match https://mozilla.github.io/pdf.js/web/viewer.html
// @grant none
// ==/UserScript==
// Control button injected into the page; its label and click handler are
// swapped between "Activate" and "Copy to clipboard" by activate()/done().
const button = document.createElement("button");

// Counter bumped on every accepted click; step() derives the highlight colour
// of the column being collected from it.
let hue = 1;

// Handle of the pending step() timeout, or null while no pass is running.
let stepTimer = null;

// Y coordinate of every span visited by step(); done() derives rows from it.
let ys = [];

// Spans collected so far for the column of the pass currently running.
let column = [];
// Columns completed by earlier passes, in the order they were extracted.
let columns = [];

// Queue of spans still to be visited in the current pass (filled by activate()).
let ySortedPage = [];
// Spans rejected by the current pass; becomes the queue of the next pass.
let ySiftedPage = [];
/**
 * Advances the "ball drop" by one span, then reschedules itself (1 ms later)
 * until the current pass over `ySortedPage` is exhausted.
 *
 * A span whose horizontal centre lies left of the ball is added to the current
 * `column` and tinted with the current hue; any other span is queued in
 * `ySiftedPage` for a later pass. When the queue is empty the finished column
 * is stored, the sifted spans become the next queue, and the button is
 * re-enabled.
 *
 * @param {number} x - Horizontal position of the ball, in the coordinate space
 *   of `getBoundingClientRect()` (initially the click's `clientX`).
 */
function step(x) {
  if (ySortedPage.length === 0) {
    // Pass finished: archive the column and prepare the next pass.
    columns.push(column);
    column = [];
    ySortedPage = ySiftedPage;
    ySiftedPage = [];
    console.log(columns);
    stepTimer = null;
    button.removeAttribute("disabled");
    return;
  }

  const span = ySortedPage.shift();
  const { x: left, y: top, width } = span.getBoundingClientRect();
  const right = left + width;
  ys.push(top);

  let ballX = x;
  const hitsSpan = left < ballX && right > ballX;
  if (hitsSpan) {
    const gapLeft = Math.abs(left - ballX);
    const gapRight = Math.abs(right - ballX);
    // More than 32px from both edges: the ball falls straight through.
    const passesThrough = Math.min(gapLeft, gapRight) > 32;
    if (!passesThrough) {
      // Otherwise it rolls off the nearer edge of the span.
      ballX = gapLeft < gapRight ? ballX - gapLeft : ballX + gapRight;
    }
  }

  const centre = left + width / 2;
  if (centre < ballX) {
    column.push(span);
    span.style.color = `hsl(${(hue % 6) * 60}, 100%, 50%)`;
  } else {
    ySiftedPage.push(span);
  }

  stepTimer = setTimeout(() => step(ballX), 1);
}
// Handle of the pending (delayed) start of a pass; done() clears it so that the
// click on the "Copy to clipboard" button itself does not start a pass.
let clickTimer;

/**
 * Body click handler (registered in the capture phase by activate()).
 * Starts a new column-extraction pass at the clicked horizontal position,
 * 250 ms after the click, unless a pass is already running.
 *
 * @param {MouseEvent} event - Only `clientX` is used, as the ball's start x.
 */
const click = ({ clientX }) => {
  if (stepTimer !== null) return;
  // Debounce: several clicks inside the delay must start a single pass, not
  // several passes running concurrently over the same queue.
  clearTimeout(clickTimer);
  clickTimer = setTimeout(() => {
    button.setAttribute("disabled", "disabled");
    ++hue;
    // step() manages `stepTimer` itself and returns nothing; assigning its
    // result would leave `stepTimer` as `undefined` and block later clicks.
    step(clientX);
  }, 250);
};
/**
 * Starts an extraction session.
 *
 * Clears any state left over from a previous session, removes earlier
 * highlighting, collects the page's `span[role=presentation]` elements,
 * drops blank and spatially duplicated ones, queues the rest top-to-bottom
 * in `ySortedPage`, and turns the button into "Copy to clipboard".
 */
function activate() {
  // Start from a clean slate: without this, columns and row positions from a
  // previous Activate/Copy cycle would leak into the next clipboard output.
  ySiftedPage = [];
  columns = [];
  column = [];
  ys = [];

  const pageWithDupes = [...document.querySelectorAll("span[role=presentation]")];
  for (const span of pageWithDupes) {
    span.style.color = "";
  }

  // Spatial deduplication: a span is a duplicate when an already-kept span has
  // the same text and lies within `tolerance` px of it on both axes.
  // Text and geometry are read once per span rather than once per comparison.
  const tolerance = 10;
  const kept = [];
  for (const span of pageWithDupes) {
    const text = span.innerText;
    if (!text.trim()) continue;
    const rect = span.getBoundingClientRect();
    const isDuplicate = kept.some(other =>
      other.text === text &&
      Math.abs(other.rect.x - rect.x) < tolerance &&
      Math.abs(other.rect.y - rect.y) < tolerance
    );
    if (!isDuplicate) kept.push({ span, text, rect });
  }

  ySortedPage = kept
    .toSorted((a, b) => a.rect.y - b.rect.y)
    .map(({ span }) => span);

  document.body.addEventListener('click', click, true);
  button.innerHTML = "Copy to clipboard";
  button.removeEventListener('click', activate);
  button.addEventListener('click', done);
}
/**
 * Ends the extraction session and copies the result to the clipboard.
 *
 * Cancels a pending pass start, detaches the body click handler, turns the
 * button back into "Activate", then lays the collected columns out as
 * tab-separated rows (one line per ~10px band of Y positions) and writes that
 * text to the clipboard. Spans placed in a row are removed from `columns`.
 */
function done() {
  clearTimeout(clickTimer);
  document.body.removeEventListener('click', click, {capture: true});
  button.removeEventListener('click', done);
  button.innerHTML = "Activate";
  button.addEventListener('click', activate);

  // Group ys into rows of 10px each.
  const px = 10;
  const rowYs = [0];
  for (const y of ys) {
    // `<=` rather than `<`: a span sitting exactly `px` below the last row
    // start is outside that row's window [start, start + px), so it must open
    // a new row or it would never be emitted.
    if (rowYs.at(-1) + px <= y) {
      rowYs.push(y - 1);
    }
  }

  const rows = rowYs.map(rowY => {
    const cells = [];
    for (const columnSpans of columns) {
      // Find the first remaining span of this column inside the row's window.
      const cellIdx = columnSpans.findIndex(s => {
        const spanY = s.getBoundingClientRect().y;
        return spanY >= rowY && spanY < (rowY + px);
      });
      if (cellIdx === -1) {
        cells.push("");
      } else {
        cells.push(columnSpans[cellIdx].innerText);
        // Consume the span so overlapping row windows cannot emit it twice.
        columnSpans.splice(cellIdx, 1);
      }
    }
    return cells.join("\t");
  });

  const doc = rows.join("\n");
  // writeText() returns a promise that can reject; report the failure instead
  // of leaving the rejection unhandled.
  navigator.clipboard.writeText(doc).catch(err => {
    console.error("PDF column extractor: could not copy to clipboard", err);
  });
}
/**
 * Clears all extraction state and installs the "Activate" button in the
 * bottom-left corner of the page.
 */
function reset() {
  // Extraction state.
  ys = [];
  column = [];
  columns = [];
  ySiftedPage = [];
  ySortedPage = [];

  // Button label, placement and behaviour.
  button.innerHTML = "Activate";
  Object.assign(button.style, {
    position: "absolute",
    left: 0,
    bottom: 0,
    fontSize: "1.5rem",
  });
  button.addEventListener('click', activate);
  document.body.appendChild(button);
}
// Entry point: initialise state and inject the button as soon as the script runs.
(() => {
  reset();
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment