Skip to content

Instantly share code, notes, and snippets.

@tiarno
Last active February 17, 2017 13:33
Show Gist options
  • Save tiarno/dea01f70a54cac52f6a6 to your computer and use it in GitHub Desktop.
Save tiarno/dea01f70a54cac52f6a6 to your computer and use it in GitHub Desktop.
PDF Checking

PDF Testing Gist

These two files, pdf_linkchecker.py and pdf_fontchecker.py, are code examples to go along with a blog article: http://reachtim.com/articles/PDF-Testing.html

See the article for details on how to test your PDFs for broken internal and external links and for unembedded fonts.

'''
Gist to accompany blog article:
http://reachtim.com/articles/PDF-Testing.html
'''
from PyPDF2 import PdfFileReader
from pprint import pprint
import sys
# Keys whose presence next to '/FontName' marks the font program as embedded.
fontkeys = {'/FontFile', '/FontFile2', '/FontFile3'}


def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect font names from a PDF object tree.

    A dictionary with a '/BaseFont' key names a font that is used; a
    dictionary with a '/FontName' key plus one of the ``fontkeys`` entries
    names a font that is embedded.

    obj   -- dict-like object (or list of such objects) to search.
    fnt   -- set that receives the names of the fonts used (mutated in place).
    emb   -- set that receives the names of embedded fonts (mutated in place).
    _seen -- internal bookkeeping for the recursion; callers leave it unset.

    Returns the same ``(fnt, emb)`` sets that were passed in.
    '''
    if _seen is None:
        _seen = {}
    # Guard against reference cycles, which would otherwise recurse until
    # the interpreter's stack limit. The object itself is stored so that its
    # id() cannot be recycled for a different object during the walk.
    if id(obj) in _seen:
        return fnt, emb
    _seen[id(obj)] = obj

    if hasattr(obj, 'keys'):
        if '/BaseFont' in obj:
            fnt.add(obj['/BaseFont'])
        elif '/FontName' in obj and fontkeys.intersection(obj):
            emb.add(obj['/FontName'])
        children = [obj[key] for key in obj]
    elif isinstance(obj, (list, tuple)):
        # Array values (for example '/DescendantFonts') hold further
        # dictionaries; items exposing getObject() are resolved first.
        children = [item.getObject() if hasattr(item, 'getObject') else item
                    for item in obj]
    else:
        return fnt, emb

    for child in children:
        if hasattr(child, 'keys') or isinstance(child, (list, tuple)):
            walk(child, fnt, emb, _seen)
    return fnt, emb
if __name__ == '__main__':
    # Report every font used in the PDF named on the command line, followed
    # by the fonts that are used but not embedded.
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.pdf' % sys.argv[0])
    fname = sys.argv[1]
    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # A page without a resource dictionary has no fonts to report.
        if '/Resources' not in obj:
            continue
        # walk() adds to the sets it is handed, so no merge step is needed.
        walk(obj['/Resources'], fonts, embedded)
    unembedded = fonts - embedded
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
'''
Gist to accompany blog article:
http://reachtim.com/articles/PDF-Testing.html
'''
from PyPDF2 import PdfFileReader
import requests
import sys
import urllib
def check_ftp(url):
    '''
    Fetch ``url`` with the standard library and report whether it has content.

    Returns a ``(result, reason)`` tuple:
      (True, 'okay')        -- the resource was fetched and is non-empty
      (False, 'Empty Page') -- the resource was fetched but has no content
      (False, exception)    -- opening or reading the resource failed
    '''
    # urlopen lives in a different module on Python 2 and Python 3; the
    # import is local so the file's top-level imports need no change.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    try:
        response = urlopen(url)
    except IOError as e:
        return False, e

    try:
        has_content = bool(response.read())
    except IOError as e:
        # A transfer that dies half-way is a bad link, not a fatal error.
        return False, e
    finally:
        # Always release the underlying connection or file handle.
        response.close()

    if has_content:
        return True, 'okay'
    return False, 'Empty Page'
def check_url(url, auth=None):
    '''
    Check that ``url`` can be retrieved.

    url  -- the link target; 'ftp://' URLs are delegated to check_ftp().
    auth -- optional credentials passed through to requests.get().

    Returns a ``(result, reason)`` tuple. ``result`` is truthy for a good
    link (the HTTP status code, or True for FTP) and False for a bad one;
    ``reason`` is the HTTP reason phrase, a short message, or the exception
    that was raised.
    '''
    if url.startswith('ftp://'):
        return check_ftp(url)

    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': '*/*'}
    try:
        response = requests.get(url, timeout=6, auth=auth, headers=headers)
    except requests.RequestException as e:
        # RequestException is the base class of every error requests raises,
        # so unsupported schemes (e.g. mailto:), malformed URLs and redirect
        # loops are reported as bad links instead of aborting the whole run.
        return False, e

    # An error status is a broken link even when the server sends an error
    # page as the body; without this check a 404 would count as good.
    if response.status_code >= 400:
        return False, '%s %s' % (response.status_code, response.reason)
    if not response.text:
        return False, 'Empty Page'
    return response.status_code, response.reason
def _lookup(mapping, key):
    '''Return ``mapping[key]``, or None when ``key`` is absent.'''
    # Item access is used instead of dict.get(): PyPDF2 dictionary objects
    # are understood to resolve indirect references on item access only
    # (NOTE(review): confirm against the installed PyPDF2 version).
    return mapping[key] if key in mapping else None


def check_pdf(pdf):
    '''
    Collect and check the link annotations of every page in ``pdf``.

    pdf -- an object exposing ``pages`` and ``namedDestinations``
           (a PyPDF2 PdfFileReader in this script).

    Returns ``(links, badlinks, urls, badurls)``:
      links    -- internal destinations ('/D' entries) found in the document
      badlinks -- named internal destinations the document does not define
      urls     -- external targets ('/URI' entries) found in the document
      badurls  -- dicts with 'url' and 'reason' for each URL that failed
    '''
    links = []
    urls = []
    badurls = []
    for page in pdf.pages:
        obj = page.getObject()
        for ref in _lookup(obj, '/Annots') or []:
            annot = ref.getObject()
            action = _lookup(annot, '/A')
            if action is None:
                # Annotations without an action (notes, form fields, ...)
                # carry no link target to check.
                continue
            dst = _lookup(action, '/D')
            url = _lookup(action, '/URI')
            if dst:
                links.append(dst)
            elif url:
                urls.append(url)
                result, reason = check_url(url)
                if not result:
                    badurls.append({'url': url, 'reason': '%r' % reason})
    named = pdf.namedDestinations
    # An array-valued '/D' is an explicit destination that points straight at
    # a page; it is not a name, so it is never looked up (and, being
    # unhashable, could not be).
    badlinks = [x for x in links
                if not isinstance(x, (list, tuple)) and x not in named]
    return links, badlinks, urls, badurls
if __name__ == '__main__':
    # Check the links of the PDF named on the command line and print the
    # URLs found, the broken internal links and the broken URLs.
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.pdf' % sys.argv[0])
    fname = sys.argv[1]
    # Each print(...) takes one pre-formatted string so the output is the
    # same on Python 2 and Python 3 (including the original double space).
    print('Checking %s' % fname)
    pdf = PdfFileReader(fname)
    links, badlinks, urls, badurls = check_pdf(pdf)
    print('urls:  %s' % (urls,))
    print('')
    print('bad links:  %s' % (badlinks,))
    print('')
    print('bad urls:  %s' % (badurls,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment