Last active
December 3, 2020 10:28
-
-
Save rw-r-r-0644/b9ca00e0a9160f66985511257dfacd99 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python3 | |
from PyPDF2 import PdfFileWriter, PdfFileReader | |
from Levenshtein import distance | |
import sys | |
import re | |
# Requires PyPDF2 and python-Levenshtein.
# Install them with `pip install PyPDF2 python-Levenshtein`.
# NOTE(review): PdfFileReader/PdfFileWriter are the legacy PyPDF2 class
# names; newer PyPDF2 releases presumably no longer provide them, so an
# older release may have to be pinned - confirm against the installed one.

# Maximum distance between the same content located in
# two different slides (accounts for some lib/pdf inconsistencies)
MAX_DISTANCE = 10

infile, outfile = None, None

# Exactly two arguments are expected: the input and the output path.
if len(sys.argv) != 3:
    print('usage: cleanup-slides [input.pdf] [output.pdf]')
    # Non-zero status so that calling scripts can detect the failure.
    sys.exit(1)

try:
    infile = open(sys.argv[1], 'rb')
    outfile = open(sys.argv[2], 'wb')
except OSError:
    # Only I/O failures are handled here; anything else (for example a
    # KeyboardInterrupt) propagates instead of being silently swallowed.
    print("Failed to open specified files")
    # The input may already be open when opening the output failed.
    if infile is not None:
        infile.close()
    sys.exit(1)

# Both handles stay open for the rest of the script: the reader pulls
# page data from infile and the writer emits the result to outfile.
srcpdf = PdfFileReader(infile)
dstpdf = PdfFileWriter()
def detectPageIndex(txt):
    """Return the run of digits that ends ``txt``, or False.

    The trailing digits are presumably the page number printed on the
    slide. False is returned when ``txt`` does not end with a digit or
    when the trailing run is six or more digits long.

    txt -- page text to inspect (may be empty).
    """
    index = 1
    # The scan is bounded by len(txt): once index exceeds the length,
    # txt[-index:] is the whole string, so a text made only of digits
    # would otherwise keep this loop running forever. It is also bounded
    # by 6 because longer runs are rejected below anyway.
    while index <= min(len(txt), 6) and txt[-index:].isnumeric():
        index += 1
    # index == 1: no trailing digit at all; index > 6: run too long.
    if index == 1 or index > 6:
        return False
    # index is one past the number of digits found.
    return txt[-index+1:]
# Pattern for everything that is not an ASCII letter or digit; it is
# stripped from the page text before slides are compared.
nonAlnum = re.compile('[^a-zA-Z0-9]+')

try:
    # Extract and normalise the text of every page exactly once, so a
    # page is not parsed again when it becomes the "current" page.
    pages = [srcpdf.getPage(i) for i in range(srcpdf.numPages)]
    texts = [nonAlnum.sub('', pg.extractText()) for pg in pages]
    indices = [detectPageIndex(txt) for txt in texts]

    # Process all the pages in the pdf
    for i, curpg in enumerate(pages):
        pageComplete = True

        # Unless this is the last page, check if the page
        # is complete and not an intermediate slide.
        #
        # A page is complete when the next page contains
        # completely different text.
        # A page is intermediate when the next page contains
        # similar starting text to the current slide
        if i + 1 < len(pages):
            curtxt, nexttxt = texts[i], texts[i + 1]
            curidx, nextidx = indices[i], indices[i + 1]

            if (nextidx is not False) and (curidx is not False):
                # Both pages carry a trailing number: the slide is
                # complete only if that number changes on the next page.
                pageComplete = curidx != nextidx
            elif curtxt:
                # if the starting text of the two slides is close,
                # they are probably the same slide
                if distance(curtxt, nexttxt[:len(curtxt)]) < MAX_DISTANCE:
                    pageComplete = False
            # A page without any extractable text is kept: the distance
            # between two empty strings is 0, which would otherwise
            # classify every such page as intermediate and drop it.

        if pageComplete:
            dstpdf.addPage(curpg)

    # Write filtered pdf
    dstpdf.write(outfile)
finally:
    # Close both files even when extraction or writing fails.
    infile.close()
    outfile.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment