rw-r-r-0644 · December 3, 2020 10:28
diff --git a/cleanup-slides.py b/cleanup-slides.py
 #!/bin/python3
 from PyPDF2 import PdfFileWriter, PdfFileReader
 from Levenshtein import distance
 import sys
 import re

 # Requires PyPDF2 and python-Levenshtein.	
 # Install them with `pip install PyPDF2 python-Levenshtein`

 # Maximum Levenshtein distance between the same content located in
 # two different slides (accounts for some lib/pdf inconsistencies)
 MAX_DISTANCE = 10


 infile, outfile = None, None

 # Exactly two arguments are expected: the input pdf and the output pdf.
 if len(sys.argv) != 3:
 	# sys.exit with a string prints it to stderr and exits with status 1,
 	# so a wrong invocation is reported as a failure to the caller.
 	sys.exit('usage: cleanup-slides [input.pdf] [output.pdf]')

 try:
 	infile = open(sys.argv[1], 'rb')
 	outfile = open(sys.argv[2], 'wb')
 except OSError:
 	# Only I/O failures are handled here; a bare except would also
 	# swallow KeyboardInterrupt and SystemExit.
 	if infile is not None:
 		# The input was opened but the output could not be: release it.
 		infile.close()
 	sys.exit("Failed to open specified files")

 # Reader over the input stream and writer collecting the pages to keep.
 srcpdf = PdfFileReader(infile)
 dstpdf = PdfFileWriter()


 def detectPageIndex(txt):
 	"""Return the run of numeric characters that ends txt.

 	Returns the trailing numeric run as a string when it is 1 to 5
 	characters long. Returns False when txt is empty, does not end with
 	a numeric character, or ends with a run longer than 5 characters.
 	"""
 	index = 1
 	# The scan is bounded by len(txt): slicing clamps to the start of the
 	# string, so for an all-numeric txt the slice would stay numeric
 	# forever and the loop would never terminate without this bound.
 	while index <= len(txt) and txt[-index:].isnumeric():
 		index += 1
 	# index - 1 is the length of the numeric run that was found.
 	if index == 1 or index > 6:
 		return False
 	return txt[-index+1:]

 # Process all the pages in the pdf
 try:
 	# Extract and normalise the text of every page once up front, so a page
 	# is not extracted a second time when it is compared as the "next" page.
 	pages = [srcpdf.getPage(i) for i in range(srcpdf.numPages)]
 	texts = [re.sub('[^a-zA-Z0-9]+', '', pg.extractText()) for pg in pages]
 	indexes = [detectPageIndex(txt) for txt in texts]

 	for i, curpg in enumerate(pages):
 		pageComplete = True

 		# Unless this is the last page, check if the page
 		# is complete and not an intermediate slide.
 		#
 		# A page is complete when the next page contains
 		# completely different text.
 		# A page is intermediate when the next page contains
 		# similar starting text to the current slide
 		if i != len(pages) - 1:
 			curtxt, nexttxt = texts[i], texts[i + 1]
 			curidx, nextidx = indexes[i], indexes[i + 1]

 			if nextidx is not False and curidx is not False:
 				# Both pages end with a detected index: the page is
 				# complete only if the next page has a different one.
 				pageComplete = curidx != nextidx
 			elif distance(curtxt, nexttxt[:len(curtxt)]) < MAX_DISTANCE:
 				# if the starting text of the two slides is close,
 				# they are probably the same slide
 				# NOTE(review): when curtxt is shorter than MAX_DISTANCE
 				# the distance is always below the threshold, so such a
 				# page is always dropped - confirm this is intended.
 				pageComplete = False

 		if pageComplete:
 			dstpdf.addPage(curpg)

 	# Write filtered pdf
 	dstpdf.write(outfile)
 finally:
 	# Close both files even when extraction or writing raises.
 	infile.close()
 	outfile.close()
	#!/bin/python3
	from PyPDF2 import PdfFileWriter, PdfFileReader
	from Levenshtein import distance
	import sys
	import re

	# Requires PyPDF2 and python-Levenshtein.
	# Install them with `pip install PyPDF2 python-Levenshtein`

	# Maximum Levenshtein distance between the same content located in
	# two different slides (accounts for some lib/pdf inconsistencies)
	MAX_DISTANCE = 10


	infile, outfile = None, None

	# Exactly two arguments are expected: the input pdf and the output pdf.
	if len(sys.argv) != 3:
		# sys.exit with a string prints it to stderr and exits with status 1,
		# so a wrong invocation is reported as a failure to the caller.
		sys.exit('usage: cleanup-slides [input.pdf] [output.pdf]')

	try:
		infile = open(sys.argv[1], 'rb')
		outfile = open(sys.argv[2], 'wb')
	except OSError:
		# Only I/O failures are handled here; a bare except would also
		# swallow KeyboardInterrupt and SystemExit.
		if infile is not None:
			# The input was opened but the output could not be: release it.
			infile.close()
		sys.exit("Failed to open specified files")

	# Reader over the input stream and writer collecting the pages to keep.
	srcpdf = PdfFileReader(infile)
	dstpdf = PdfFileWriter()


	def detectPageIndex(txt):
		"""Return the run of numeric characters that ends txt.

		Returns the trailing numeric run as a string when it is 1 to 5
		characters long. Returns False when txt is empty, does not end with
		a numeric character, or ends with a run longer than 5 characters.
		"""
		index = 1
		# The scan is bounded by len(txt): slicing clamps to the start of the
		# string, so for an all-numeric txt the slice would stay numeric
		# forever and the loop would never terminate without this bound.
		while index <= len(txt) and txt[-index:].isnumeric():
			index += 1
		# index - 1 is the length of the numeric run that was found.
		if index == 1 or index > 6:
			return False
		return txt[-index+1:]

	# Process all the pages in the pdf
	try:
		# Extract and normalise the text of every page once up front, so a page
		# is not extracted a second time when it is compared as the "next" page.
		pages = [srcpdf.getPage(i) for i in range(srcpdf.numPages)]
		texts = [re.sub('[^a-zA-Z0-9]+', '', pg.extractText()) for pg in pages]
		indexes = [detectPageIndex(txt) for txt in texts]

		for i, curpg in enumerate(pages):
			pageComplete = True

			# Unless this is the last page, check if the page
			# is complete and not an intermediate slide.
			#
			# A page is complete when the next page contains
			# completely different text.
			# A page is intermediate when the next page contains
			# similar starting text to the current slide
			if i != len(pages) - 1:
				curtxt, nexttxt = texts[i], texts[i + 1]
				curidx, nextidx = indexes[i], indexes[i + 1]

				if nextidx is not False and curidx is not False:
					# Both pages end with a detected index: the page is
					# complete only if the next page has a different one.
					pageComplete = curidx != nextidx
				elif distance(curtxt, nexttxt[:len(curtxt)]) < MAX_DISTANCE:
					# if the starting text of the two slides is close,
					# they are probably the same slide
					# NOTE(review): when curtxt is shorter than MAX_DISTANCE
					# the distance is always below the threshold, so such a
					# page is always dropped - confirm this is intended.
					pageComplete = False

			if pageComplete:
				dstpdf.addPage(curpg)

		# Write filtered pdf
		dstpdf.write(outfile)
	finally:
		# Close both files even when extraction or writing raises.
		infile.close()
		outfile.close()