Last active
July 27, 2022 08:48
-
-
Save Kuo-TingKai/e5451c4148d47ccdb87d4d8e29d74dca to your computer and use it in GitHub Desktop.
(Not Done Yet) Merge all pdf files in QC course
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find links to pdf files in HTML with BeautifulSoup | |
import os
import shutil
import urllib
import urllib.request

import numpy as np
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger
def download_file(download_url, filename):
    """Download the resource at ``download_url`` and save it as ``filename``.

    Args:
        download_url: URL handed to ``urllib.request.urlopen``.
        filename: Path of the local file to create or overwrite; it is
            opened in binary mode.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the destination file cannot be written.
    """
    # Context managers close both the response and the output file even when
    # the transfer fails part-way through. The URL is opened first, so a bad
    # URL does not leave an empty file behind.
    with urllib.request.urlopen(download_url) as response, \
            open(filename, 'wb') as out_file:
        # Stream in chunks instead of holding the whole body in memory.
        shutil.copyfileobj(response, out_file)
def reset_eof_of_pdf_return_stream(pdf_stream_in: list) -> list:
    """Trim a PDF's lines so they end at the last ``%%EOF`` marker.

    Args:
        pdf_stream_in: The file's content as a list of ``bytes`` lines, such
            as the result of ``readlines()`` on a file opened in ``'rb'``
            mode.

    Returns:
        A new list holding every line up to and including the last line that
        contains ``b'%%EOF'``. If no line contains the marker (including
        empty input), a copy of the whole input is returned, because there
        is nothing to trim.
    """
    # Walk backwards so the *last* marker in the file is the one that counts.
    for offset_from_end, line in enumerate(reversed(pdf_stream_in)):
        if b'%%EOF' in line:
            actual_line = len(pdf_stream_in) - offset_from_end
            print(f'EOF found at line position {-offset_from_end} = actual {actual_line}, with value {line}')
            # Keep everything up to and including the marker line.
            return pdf_stream_in[:actual_line]
    # No marker found: previously this path hit an unbound local variable.
    return pdf_stream_in[:]
# Course page whose directly linked PDF files are collected.
my_url = 'https://homepages.cwi.nl/~fehr/QC2020/'
# Close the HTTP response as soon as the page body has been read.
with urllib.request.urlopen(my_url) as response:
    html = response.read()
# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed (and to avoid bs4's warning).
content = BeautifulSoup(html, 'html.parser')
current_link = ''
pdfs = []
for link in content.find_all('a'):
    current_link = link.get('href')
    # Anchors without an href attribute yield None; skip them instead of
    # crashing on .endswith().
    if not current_link:
        continue
    # Keep only PDFs that live next to the page itself (no path separator).
    if current_link.endswith('pdf') and "/" not in current_link:
        print('link of pdf: ' + current_link)
        pdfs.append(current_link)
# Persist the collected file names, one per line.
np.savetxt("pdfs.txt", pdfs, fmt="%s")
# for pdf in pdfs: | |
#     download_file(my_url + pdf, pdf)
# for pdf in pdfs: | |
# # opens the file for reading | |
# with open('pdf_files/'+pdf, 'rb') as p: | |
# txt = (p.readlines()) | |
# # get the new list terminating correctly | |
# txt_right_eof = reset_eof_of_pdf_return_stream(txt) | |
# # write to new pdf | |
#     with open(pdf.split('.')[0] + "_fixed.pdf", 'wb') as f:
# f.writelines(txt_right_eof) | |
# merger = PdfFileMerger() | |
# for pdf in pdfs: | |
# merger.append(pdf) | |
# merger.write("qc_book_all.pdf") | |
# merger.close() | |
# #Create and instance of PdfFileMerger() class | |
# merger = PdfFileMerger() | |
# #Create a list with PDF file names | |
# path_to_files = r'pdf_files/' | |
# #Get the file names in the directory | |
# for root, dirs, pdfs in os.walk(path_to_files): | |
# #Iterate over the list of file names | |
# for file_name in pdfs: | |
# #Append PDF files | |
# merger.append(path_to_files + file_name) | |
# #Write out the merged PDF | |
# merger.write("merged_all_pages.pdf") | |
# merger.close() | |
# import unicodedata | |
# import numpy as np | |
# import re | |
# full = np.loadtxt("pdfs.txt") | |
# half = np.copy(full) | |
# for txt in half: | |
# txt = unicodedata.normalize('NFKC', res = (re.sub('.', lambda x: r'\u % 04X' % ord(x.group()), txt))).encode('ascii', 'ignore') | |
# np.savetxt("pdfs_half.txt",half) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment