Created
March 22, 2023 19:51
-
-
Save rileymjohnson/a1175752bac7c84e586d77186e6b3b96 to your computer and use it in GitHub Desktop.
Extract the letters of a PDF document and their positions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
When loading the PDF document, you need to set the `fontExtraProperties` parameter to `true`, e.g.
getDocument({
  ...,
  fontExtraProperties: true
})
*/
/**
 * Builds a short lowercase alphanumeric identifier.
 *
 * The result is the current timestamp in base 36 followed by a random
 * integer in base 36. It relies on `Math.random()`, so it is neither
 * guaranteed unique nor suitable for security-sensitive use.
 *
 * @returns {string} Timestamp part concatenated with the random part.
 */
function generateId() {
  const timestampPart = Date.now().toString(36)
  // Random integer in [10^12, 10^13), so the suffix always has a similar length.
  const randomPart = Math.floor(
    Math.pow(10, 12) + Math.random() * 9 * Math.pow(10, 12)
  ).toString(36)
  return timestampPart + randomPart
}
/**
 * Converts a text item into a rectangle expressed as fractions of the
 * viewport's width and height.
 *
 * Relies on a `Util` object (presumably the pdf.js utility class) being in
 * scope and providing `transform` and `applyTransform`.
 *
 * @param {object} item - Text item; its `transform`, `width` and `height` are read.
 * @param {object} viewport - Viewport; its `transform`, `width` and `height` are read.
 * @param {number} [heightScale=1] - Multiplier applied to the normalized height.
 * @returns {{left: number, top: number, width: number, height: number}}
 *   Rectangle with every value divided by the matching viewport dimension.
 */
function itemToRect(item, viewport, heightScale = 1) {
  const combined = Util.transform(viewport.transform, item.transform)
  // The anchor is the point (0, 1) of the item's own space mapped through
  // the combined viewport + item matrix.
  const [anchorX, anchorY] = Util.applyTransform([0, 1], combined)
  // NOTE(review): both extents are read from the x component of the viewport
  // transform, which presumably assumes an unrotated, uniformly scaled
  // viewport without a horizontal offset - confirm before using rotated pages.
  const scaledWidth = Util.applyTransform(
    [item.width, 0],
    viewport.transform
  )[0]
  const scaledHeight = Util.applyTransform(
    [item.height, 0],
    viewport.transform
  )[0]
  return {
    left: anchorX / viewport.width,
    top: anchorY / viewport.height,
    width: scaledWidth / viewport.width,
    height: (scaledHeight / viewport.height) * heightScale
  }
}
/**
 * Yields one rectangle per character of a text item by splitting the item's
 * bounding box horizontally in proportion to each character's width in the
 * item's font.
 *
 * Reads the page viewport from the global `viewer` and the font object from
 * `page.commonObjs`; the font must expose a `widths` lookup.
 *
 * @param {object} item - Text item; `str`, `hasEOL` and `fontName` are read
 *   here, and the item is also passed to `itemToRect`.
 * @param {object} page - Page; `pageNumber` and `commonObjs` are read.
 * @param {number} [heightScale] - Forwarded to `itemToRect`.
 * @yields {{letter: string, left: number, width: number, page: number,
 *   top: number, height: number}} Rectangle for a single character.
 */
function* itemToRects(item, page, heightScale) {
  // Page views are indexed from zero while page numbers start at one.
  const { viewport } = viewer.getPageView(page.pageNumber - 1)
  const bbox = itemToRect(item, viewport, heightScale)
  const font = page.commonObjs.get(item.fontName)

  // A trailing newline is emitted as its own character when the item ends a line.
  const text = item.hasEOL ? item.str + '\n' : item.str

  // First pass: record every character's width and running offset so the
  // total width is known before any rectangle is produced.
  const glyphs = []
  let totalFontWidth = 0
  for (const letter of text) {
    // for...of walks code points, so the lookup uses the full code point;
    // charCodeAt() would only see the high surrogate of astral characters.
    // Assumes `widths` is keyed by character code - TODO confirm.
    const fontWidth = font.widths[letter.codePointAt(0)] || 0
    glyphs.push({ letter, fontWidth, leftOffset: totalFontWidth })
    totalFontWidth += fontWidth
  }

  // Second pass: scale the font-space offsets into the bounding box.
  for (const { letter, fontWidth, leftOffset } of glyphs) {
    // NOTE(review): when no width is known the rectangle collapses to
    // left = 0 / width = 0 rather than the item's own left edge.
    let left = 0
    let width = 0
    if (totalFontWidth > 0) {
      left = leftOffset / totalFontWidth * bbox.width + bbox.left
      width = fontWidth / totalFontWidth * bbox.width
    }
    yield {
      letter,
      left,
      width,
      page: page.pageNumber,
      top: bbox.top,
      height: bbox.height
    }
  }
}
/**
 * Collects a positioned record for every character on a page.
 *
 * Awaits `page.getTextContent()`, expands each text item through
 * `itemToRects`, and wraps every character rectangle in an object carrying a
 * generated id, the character and its position.
 *
 * @param {object} page - Page; `getTextContent()` and `pageNumber` are used.
 * @returns {Promise<Array<{id: string, text: string,
 *   position: {page: number, rects: object[]}}>>} One entry per character,
 *   in the order the text items and their characters are produced.
 */
async function getLetterRects(page) {
  // Height multiplier forwarded to itemToRects. Kept as a local constant:
  // writing `heightScale=1.25` in the call would assign to an undeclared
  // variable (implicit global, or a ReferenceError in strict mode).
  const heightScale = 1.25
  const { items } = await page.getTextContent()
  const formattedLetterRects = []
  for (const item of items) {
    for (const { letter, ...rect } of itemToRects(item, page, heightScale)) {
      formattedLetterRects.push({
        id: generateId(),
        text: letter,
        position: {
          page: page.pageNumber,
          rects: [rect]
        }
      })
    }
  }
  return formattedLetterRects
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment