Skip to content

Instantly share code, notes, and snippets.

@Kuo-TingKai
Last active July 27, 2022 08:48
Show Gist options
  • Save Kuo-TingKai/e5451c4148d47ccdb87d4d8e29d74dca to your computer and use it in GitHub Desktop.
Save Kuo-TingKai/e5451c4148d47ccdb87d4d8e29d74dca to your computer and use it in GitHub Desktop.
(Not Done Yet) Merge all pdf files in QC course
# Find links to pdf files in HTML with BeautifulSoup
import os
import urllib
import urllib.request

import numpy as np
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger
def download_file(download_url, filename):
    """Fetch ``download_url`` and write the response body to ``filename``.

    Args:
        download_url: URL handed to ``urllib.request.urlopen``.
        filename: Path of the output file; it is opened in binary write
            mode, so an existing file is overwritten.

    Raises:
        Whatever ``urlopen`` raises for a failed request (for example
        ``urllib.error.URLError``) and ``OSError`` if the output file
        cannot be opened or written; nothing is caught here.
    """
    # Read the whole body before touching the disk so a failed download
    # does not leave an empty or truncated output file behind.
    with urllib.request.urlopen(download_url) as response:
        payload = response.read()
    # The context manager closes the file even if the write fails.
    with open(filename, 'wb') as out_file:
        out_file.write(payload)
def reset_eof_of_pdf_return_stream(pdf_stream_in: list) -> list:
    """Return the stream cut off right after its last ``%%EOF`` line.

    Args:
        pdf_stream_in: Sequence of ``bytes`` lines, e.g. the result of
            ``readlines()`` on a file opened in binary mode.

    Returns:
        A new list holding every line up to and including the last line
        that contains ``b'%%EOF'``. If no line contains the marker (which
        includes empty input), an unchanged copy of the input is returned.
    """
    # Walk backwards so the first hit is the last marker in the stream.
    for i, x in enumerate(reversed(pdf_stream_in)):
        if b'%%EOF' in x:
            # i counts from the end, so this is the slice bound that keeps
            # the marker line itself.
            actual_line = len(pdf_stream_in) - i
            print(f'EOF found at line position {-i} = actual {actual_line}, with value {x}')
            return pdf_stream_in[:actual_line]
    # No marker found: previously this path raised UnboundLocalError because
    # actual_line was never assigned. Hand the data back untouched instead.
    return pdf_stream_in[:]
# Page whose anchors are scanned for PDF links.
my_url = 'https://homepages.cwi.nl/~fehr/QC2020/'

# Use a context manager so the HTTP response is closed once the body is read.
with urllib.request.urlopen(my_url) as response:
    html = response.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning about the guess.
content = BeautifulSoup(html, 'html.parser')

current_link = ''
pdfs = []
for link in content.find_all('a'):
    # Anchors without an href attribute yield None; treat them as an empty
    # link so the string checks below skip them instead of raising.
    current_link = link.get('href') or ''
    # Keep only links that end in "pdf" and contain no "/", i.e. skip
    # absolute URLs and links that point into other directories.
    if current_link.endswith('pdf') and "/" not in current_link:
        print('link of pdf: ' + current_link)
        pdfs.append(current_link)

# Persist the collected link names, one per line.
np.savetxt("pdfs.txt", pdfs, fmt="%s")
# for pdf in pdfs:
# download_file(my_url + pdf, pdf)
# for pdf in pdfs:
# # opens the file for reading
# with open('pdf_files/'+pdf, 'rb') as p:
# txt = (p.readlines())
# # get the new list terminating correctly
# txt_right_eof = reset_eof_of_pdf_return_stream(txt)
# # write to new pdf
# with open(pdf.split('.')[0]+"_fixed.pdf", 'wb') as f:
# f.writelines(txt_right_eof)
# merger = PdfFileMerger()
# for pdf in pdfs:
# merger.append(pdf)
# merger.write("qc_book_all.pdf")
# merger.close()
# #Create and instance of PdfFileMerger() class
# merger = PdfFileMerger()
# #Create a list with PDF file names
# path_to_files = r'pdf_files/'
# #Get the file names in the directory
# for root, dirs, pdfs in os.walk(path_to_files):
# #Iterate over the list of file names
# for file_name in pdfs:
# #Append PDF files
# merger.append(path_to_files + file_name)
# #Write out the merged PDF
# merger.write("merged_all_pages.pdf")
# merger.close()
# import unicodedata
# import numpy as np
# import re
# full = np.loadtxt("pdfs.txt")
# half = np.copy(full)
# for txt in half:
# txt = unicodedata.normalize('NFKC', res = (re.sub('.', lambda x: r'\u % 04X' % ord(x.group()), txt))).encode('ascii', 'ignore')
# np.savetxt("pdfs_half.txt",half)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment