-
-
Save gfrlv/a0d6307976d4f5e969972d15e25682a3 to your computer and use it in GitHub Desktop.
find PDF font info with PyPDF2, example code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyPDF2 import PdfFileReader | |
from pprint import pprint | |
def walk(obj, fnt, emb):
    """Collect the fonts used by, and embedded in, a PDF object tree.

    A dictionary with a '/BaseFont' key names a font that is used in the
    document. A dictionary that has a '/FontName' key together with one of
    '/FontFile', '/FontFile2' or '/FontFile3' names a font that is embedded.

    The tree is searched iteratively. Nested dictionaries and arrays are
    both followed, and every container is visited at most once, so cyclic
    references (for example a '/Parent' back-link) and deeply nested
    structures cannot exhaust the recursion limit.

    Args:
        obj: Root of the tree to search; normally a page's '/Resources'
            dictionary.
        fnt: Set updated in place with every '/BaseFont' value found.
        emb: Set updated in place with every embedded '/FontName' found.

    Returns:
        ``(fnt, emb)``, the same two set objects that were passed in, or
        ``(None, None)`` when ``obj`` is not dictionary-like (it has no
        ``keys`` attribute).
    """
    if not hasattr(obj, 'keys'):
        return None, None

    fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')
    # Maps id(container) -> container. Holding a reference to each visited
    # container guarantees its id cannot be reused by another object while
    # the walk is still in progress.
    seen = {}
    pending = [obj]
    while pending:
        node = pending.pop()
        if id(node) in seen:
            continue
        if hasattr(node, 'keys'):
            seen[id(node)] = node
            if '/BaseFont' in node:
                fnt.add(node['/BaseFont'])
            if '/FontName' in node and any(key in node for key in fontkeys):
                emb.add(node['/FontName'])
            pending.extend(node[key] for key in node.keys())
        elif isinstance(node, (list, tuple)):
            seen[id(node)] = node
            for item in node:
                # NOTE(review): PyPDF2 arrays appear to store indirect
                # references unresolved (unlike dictionary lookups), so
                # resolve them when the element offers getObject().
                resolve = getattr(item, 'getObject', None)
                pending.append(resolve() if callable(resolve) else item)
    return fnt, emb
if __name__ == '__main__':
    # Command-line entry point: list every font used by the given PDF and,
    # separately, the fonts that are used but not embedded.
    import sys

    if len(sys.argv) < 2:
        print("Usage:\n{0} filename.pdf".format(sys.argv[0]))
        # A missing argument is a usage error, so report failure to the shell.
        sys.exit(1)
    fname = sys.argv[1]

    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        node = page.getObject()
        # '/Resources' is an inheritable page attribute: when the page does
        # not carry its own, look for it on the ancestors in the page tree.
        while '/Resources' not in node and '/Parent' in node:
            node = node['/Parent']
        if '/Resources' not in node:
            continue  # page has no resources at all, hence no fonts
        # walk() adds to both sets in place, so its return value (which may
        # be (None, None) for a non-dictionary argument) is not needed.
        walk(node['/Resources'], fonts, embedded)

    unembedded = fonts - embedded
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(sorted(unembedded))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment