Skip to content

Instantly share code, notes, and snippets.

@rw-r-r-0644
Last active December 3, 2020 10:28
Show Gist options
  • Save rw-r-r-0644/b9ca00e0a9160f66985511257dfacd99 to your computer and use it in GitHub Desktop.
Save rw-r-r-0644/b9ca00e0a9160f66985511257dfacd99 to your computer and use it in GitHub Desktop.
#!/bin/python3
from PyPDF2 import PdfFileWriter, PdfFileReader
from Levenshtein import distance
import sys
import re
# Requires PyPDF2 and python-Levenshtein.
# Install them with `pip install PyPDF2 python-Levenshtein`
# Maximum Levenshtein distance under which the text of two consecutive
# slides is still treated as the same content (accounts for some lib/pdf
# inconsistencies in the extracted text).
MAX_DISTANCE = 10

infile, outfile = None, None

# Exactly two arguments are expected: the input pdf and the output pdf.
if len(sys.argv) != 3:
    print('usage: cleanup-slides [input.pdf] [output.pdf]', file=sys.stderr)
    sys.exit(1)

try:
    infile = open(sys.argv[1], 'rb')
    outfile = open(sys.argv[2], 'wb')
except OSError:
    print("Failed to open specified files", file=sys.stderr)
    # The input may have opened successfully before the output failed;
    # release it instead of leaking the handle.
    if infile is not None:
        infile.close()
    sys.exit(1)

# Reader over the source slides and writer collecting the pages to keep.
srcpdf = PdfFileReader(infile)
dstpdf = PdfFileWriter()
def detectPageIndex(txt):
    """Return the trailing page number of a slide's text.

    Args:
        txt: Slide text to inspect.

    Returns:
        The run of digits at the very end of ``txt`` as a string when it is
        between one and five digits long; ``False`` when ``txt`` is empty,
        does not end in a digit, or ends in more than five digits.
    """
    count = 0
    # Bound the scan by len(txt): slicing past the start of a string simply
    # yields the whole string, so without the bound a text made only of
    # digits would keep matching forever.
    while count < len(txt) and txt[-(count + 1)].isnumeric():
        count += 1
    if count == 0 or count > 5:
        return False
    return txt[-count:]
# Matches every run of characters that is not an ASCII letter or digit.
_NON_ALNUM = re.compile('[^a-zA-Z0-9]+')


def _normalizedText(page):
    """Return the text extracted from `page` with non-alphanumerics removed."""
    return _NON_ALNUM.sub('', page.extractText())


try:
    # Extract and normalise the text of every page exactly once; each page
    # is looked at twice below (as the current page and as the successor of
    # the page before it).
    pages = [srcpdf.getPage(i) for i in range(srcpdf.numPages)]
    texts = [_normalizedText(pg) for pg in pages]
    indices = [detectPageIndex(txt) for txt in texts]

    # Process all the pages in the pdf
    for i, curpg in enumerate(pages):
        pageComplete = True

        # Unless this is the last page (which is always kept), check if the
        # page is complete and not an intermediate slide.
        #
        # A page is complete when the next page contains
        # completely different text.
        # A page is intermediate when the next page contains
        # similar starting text to the current slide.
        if i != len(pages) - 1:
            curtxt, nexttxt = texts[i], texts[i + 1]
            curidx, nextidx = indices[i], indices[i + 1]

            if (nextidx is not False) and (curidx is not False):
                # Both texts end in a page number: the page is complete
                # exactly when that number changes on the next page.
                pageComplete = curidx != nextidx
            else:
                # If the starting text of the two slides is close,
                # they are probably the same slide.
                # The edit distance can never exceed len(curtxt), so the
                # threshold is capped by it; otherwise every page with
                # fewer than MAX_DISTANCE characters (including pages with
                # no extractable text) would always be dropped.
                threshold = min(MAX_DISTANCE, len(curtxt))
                if distance(curtxt, nexttxt[:len(curtxt)]) < threshold:
                    pageComplete = False

        if pageComplete:
            dstpdf.addPage(curpg)

    # Write filtered pdf
    dstpdf.write(outfile)
finally:
    # Release both file handles even when extraction or writing fails.
    infile.close()
    outfile.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment