Created
June 14, 2024 03:23
-
-
Save phunanon/8645ecb2ab81eee4825d98fb4fd5187f to your computer and use it in GitHub Desktop.
Because I'm sick of having to watch Mikha do it by hand! Uses a ball-drop technique.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // ==UserScript== | |
| // @name PDF column extractor | |
| // @description extracts PDF pages for Mikha | |
| // @namespace http://tampermonkey.net/ | |
| // @version 2024-06-13 | |
| // @author Patrick Bowen | |
| // @match https://mozilla.github.io/pdf.js/web/viewer.html | |
| // @grant none | |
| // ==/UserScript== | |
// ---- Shared script state ----

// The single control button. Its label and click handler are swapped by
// activate() and done(); reset() attaches it to the page.
const button = document.createElement("button");

// Counter bumped by click() before each pass; step() turns it into the
// highlight hue of the spans captured during that pass.
let hue = 1;

// Timer bookkeeping for step(): set to null by step() when a pass finishes,
// and checked by click() to refuse new passes while one is running.
let stepTimer = null;

// Spans still to be visited by the current pass (sorted by Y in activate()).
let ySortedPage = [];
// Spans the current pass did not capture; they feed the next pass.
let ySiftedPage = [];

// Spans captured by the current pass, and the list of finished columns.
let column = [];
let columns = [];

// Y coordinate of every span visited by step(); done() groups these into rows.
let ys = [];
/**
 * Performs one iteration of a "ball drop" pass and schedules the next one.
 *
 * A ball falls from the top of the page at horizontal position `x`. Each call
 * takes the topmost remaining span from `ySortedPage`:
 *  - if the ball lands on the span near one of its edges, the ball is moved
 *    to the nearer edge; if it lands further than 32px from both edges it
 *    passes straight through without moving;
 *  - spans whose horizontal centre is left of the ball go into the current
 *    `column` (and are tinted), all others are kept in `ySiftedPage`.
 *
 * When `ySortedPage` is exhausted the pass ends: the column is appended to
 * `columns`, the sifted spans become the input of the next pass, `stepTimer`
 * is cleared and the button is re-enabled.
 *
 * @param {number} x - Horizontal position of the ball, in viewport pixels.
 * @returns {void}
 */
function step(x) {
  if (ySortedPage.length === 0) {
    // End of pass: bank the column and queue the leftovers for the next one.
    columns.push(column);
    column = [];
    ySortedPage = ySiftedPage;
    ySiftedPage = [];
    console.log(columns);
    stepTimer = null;
    button.removeAttribute("disabled");
    return;
  }

  // Distance (px) from both edges beyond which the ball falls through a span.
  const passThroughMargin = 32;

  const span = ySortedPage.shift();
  const rect = span.getBoundingClientRect();
  // Every visited span contributes its Y; done() derives the rows from these.
  ys.push(rect.y);

  // Work on a local copy so the parameter itself is never reassigned.
  let ballX = x;
  const isObstacle = rect.x < x && (rect.x + rect.width) > x;
  if (isObstacle) {
    const toLeft = Math.abs(rect.x - x);
    const toRight = Math.abs((rect.x + rect.width) - x);
    const goThrough = toLeft > passThroughMargin && toRight > passThroughMargin;
    if (!goThrough) {
      // Roll off the nearer edge of the span.
      ballX = toLeft < toRight ? x - toLeft : x + toRight;
    }
  }

  if ((rect.x + rect.width / 2) < ballX) {
    column.push(span);
    span.style.color = `hsl(${(hue % 6) * 60}, 100%, 50%)`;
  } else {
    ySiftedPage.push(span);
  }

  // One span per tick keeps the page responsive and animates the colouring.
  stepTimer = setTimeout(() => step(ballX), 1);
}
// Handle of the pending debounce timeout started by click(); done() cancels it.
let clickTimer;

/**
 * Click handler that starts a ball-drop pass at the clicked X position.
 *
 * The pass starts 250ms after the click rather than immediately; done()
 * cancels the pending timeout (presumably so that the click on the
 * "Copy to clipboard" button itself does not start a pass). Clicks are
 * ignored while a pass is running.
 *
 * @param {{clientX: number}} event - Click event; only `clientX` is used.
 * @returns {void}
 */
const click = ({ clientX }) => {
  if (stepTimer !== null) return;

  // Only the most recent click may start a pass; otherwise a quick double
  // click would launch two passes that interleave on the same span queue.
  clearTimeout(clickTimer);
  clickTimer = setTimeout(() => {
    button.setAttribute("disabled", "disabled");
    ++hue;
    // step() maintains stepTimer itself and returns nothing. Storing its
    // return value would overwrite the handle with undefined and, if the
    // pass finished synchronously, block every later click.
    step(clientX);
  }, 250);
};
/**
 * Starts an extraction session.
 *
 * Collects the text spans of the page, clears their highlight colour, drops
 * blank spans and near-identical duplicates, stores the rest sorted
 * top-to-bottom in `ySortedPage`, starts listening for page clicks and turns
 * the button into the "Copy to clipboard" trigger.
 *
 * @returns {void}
 */
function activate() {
  // Start from a clean slate: without this, columns and row positions
  // collected in a previous session would leak into this session's output.
  ySiftedPage = [];
  columns = [];
  column = [];
  ys = [];

  const pageWithDupes = [...document.querySelectorAll("span[role=presentation]")];
  for (const span of pageWithDupes) {
    span.style.color = "";
  }

  // Spatial deduplication: a span is a duplicate when an already accepted
  // span has exactly the same text and sits within `tolerance` px on both axes.
  const tolerance = 10;
  const acceptedByText = new Map(); // text -> positions of accepted spans
  const dedupedPage = [];           // { span, y } in document order
  for (const span of pageWithDupes) {
    const text = span.innerText;
    if (!text.trim()) continue;

    // Read the layout once per span instead of once per comparison.
    const { x, y } = span.getBoundingClientRect();
    let positions = acceptedByText.get(text);
    if (!positions) {
      positions = [];
      acceptedByText.set(text, positions);
    }
    const isDuplicate = positions.some(
      (p) => Math.abs(p.x - x) < tolerance && Math.abs(p.y - y) < tolerance,
    );
    if (isDuplicate) continue;

    positions.push({ x, y });
    dedupedPage.push({ span, y });
  }

  // Array.prototype.sort is stable, so spans with equal Y keep document order.
  ySortedPage = dedupedPage
    .sort((a, b) => a.y - b.y)
    .map((entry) => entry.span);

  document.body.addEventListener('click', click, true);
  button.textContent = "Copy to clipboard";
  button.removeEventListener('click', activate);
  button.addEventListener('click', done);
}
/**
 * Ends the extraction session and copies the result to the clipboard.
 *
 * Cancels any pending click, detaches the page click listener, turns the
 * button back into the "Activate" trigger, then lays the captured columns
 * out as a table: the recorded span Y positions (`ys`) are grouped into rows
 * about 10px tall, each row takes at most one span per column, cells are
 * joined with tabs and rows with newlines.
 *
 * @returns {void}
 */
function done() {
  clearTimeout(clickTimer);
  document.body.removeEventListener('click', click, { capture: true });
  button.removeEventListener('click', done);
  button.textContent = "Activate";
  button.addEventListener('click', activate);

  // Group ys into rows of 10px each. Each row starts 1px above the first Y
  // that opened it and covers the half-open range [start, start + px).
  const px = 10;
  const rowYs = [0];
  for (const y of ys) {
    // `<=` keeps the grouping in step with the half-open range used for
    // matching below; with `<`, a span exactly `px` below a row start would
    // be grouped into that row but matched by no row, and so be dropped.
    if (rowYs.at(-1) + px <= y) {
      rowYs.push(y - 1);
    }
  }

  // Work on copies with the Y position read once per span, so the shared
  // `columns` arrays are left untouched and layout is not re-read per row.
  const remaining = columns.map((spans) =>
    spans.map((span) => ({ span, top: span.getBoundingClientRect().y })),
  );

  const rows = rowYs.map((rowY) => {
    const cells = [];
    for (const candidates of remaining) {
      // First span of this column that falls inside the row, if any.
      const cellIdx = candidates.findIndex(
        ({ top }) => top >= rowY && top < rowY + px,
      );
      if (cellIdx === -1) {
        cells.push("");
      } else {
        // Consume the span so a later, overlapping row cannot reuse it.
        const [{ span }] = candidates.splice(cellIdx, 1);
        cells.push(span.innerText);
      }
    }
    return cells.join("\t");
  });

  const doc = rows.join("\n");
  // writeText() returns a promise; report a refusal instead of leaving an
  // unhandled rejection.
  navigator.clipboard.writeText(doc).catch((err) => {
    console.error("PDF column extractor: could not copy to clipboard", err);
  });
}
/**
 * Puts the script into its initial state and installs the control button.
 *
 * Empties every span/column/row collection, labels the button "Activate",
 * pins it to the bottom-left corner, wires it to activate() and appends it
 * to the document body.
 *
 * @returns {void}
 */
function reset() {
  // Forget everything gathered so far.
  ySortedPage = [];
  ySiftedPage = [];
  columns = [];
  column = [];
  ys = [];

  // Label and placement of the button.
  button.innerHTML = "Activate";
  Object.assign(button.style, {
    position: "absolute",
    left: 0,
    bottom: 0,
    fontSize: "1.5rem",
  });

  // Make it live.
  button.addEventListener('click', activate);
  document.body.appendChild(button);
}
// Entry point: runs once when the userscript is loaded and installs the button.
(() => {
  reset();
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment