Kuo-TingKai · July 27, 2022 08:48
diff --git a/PdfMerger.py b/PdfMerger.py
 # Find links to pdf files in HTML with BeautifulSoup
 import os
 import urllib
 import urllib.request

 import numpy as np
 from bs4 import BeautifulSoup
 from PyPDF2 import PdfFileMerger
 def download_file(download_url, filename):
    response = urllib.request.urlopen(download_url)    
    file = open(filename, 'wb')
    file.write(response.read())
    file.close()

 def reset_eof_of_pdf_return_stream(pdf_stream_in:list):
    # find the line position of the EOF
    for i, x in enumerate(pdf_stream_in[::-1]):
        if b'%%EOF' in x:
            actual_line = len(pdf_stream_in)-i
            print(f'EOF found at line position {-i} = actual {actual_line}, with value {x}')
            break
    # return the list up to that point
    return pdf_stream_in[:actual_line]

 my_url = 'https://homepages.cwi.nl/~fehr/QC2020/'
 html=urllib.request.urlopen(my_url).read()
 content = BeautifulSoup(html)
 current_link = ''

 pdfs = []
 for link in content.find_all('a'):
    current_link = link.get('href')
    if current_link.endswith('pdf') and "/" not in current_link: 
        print('link of pdf: ' + current_link)
        pdfs.append(current_link)

 np.savetxt("pdfs.txt",pdfs,fmt="%s")

 # for pdf in pdfs: 
 #     download_file(my_url, pdf)

 # for pdf in pdfs:
 #     # opens the file for reading
 #     with open('pdf_files/'+pdf, 'rb') as p:
 #         txt = (p.readlines())

 #     # get the new list terminating correctly
 #     txt_right_eof = reset_eof_of_pdf_return_stream(txt)

 #     # write to new pdf
 #     with open(pdf.split('.')[0]+"_fixed.pdf"), 'wb' as f:
 #         f.writelines(txt_right_eof)

 # merger = PdfFileMerger()

 # for pdf in pdfs:
 #     merger.append(pdf)

 # merger.write("qc_book_all.pdf")
 # merger.close()


 # #Create an instance of PdfFileMerger() class
 # merger = PdfFileMerger()
 # #Create a list with PDF file names
 # path_to_files = r'pdf_files/'
 # #Get the file names in the directory
 # for root, dirs, pdfs in os.walk(path_to_files):
 #     #Iterate over the list of file names
 #     for file_name in pdfs:
 #         #Append PDF files
 #         merger.append(path_to_files + file_name)
 # #Write out the merged PDF
 # merger.write("merged_all_pages.pdf")
 # merger.close()


 # import unicodedata
 # import numpy as np 
 # import re
 # full = np.loadtxt("pdfs.txt")
 # half = np.copy(full)
 # for txt in half:
 #     txt = unicodedata.normalize('NFKC', res = (re.sub('.', lambda x: r'\u % 04X' % ord(x.group()), txt))).encode('ascii', 'ignore')
 # np.savetxt("pdfs_half.txt",half)
	# Find links to pdf files in HTML with BeautifulSoup
	import os
	import urllib
	from bs4 import BeautifulSoup
	from PyPDF2 import PdfFileMerger
	import numpy as np
	def download_file(download_url, filename):
	response = urllib.request.urlopen(download_url)
	file = open(filename, 'wb')
	file.write(response.read())
	file.close()

	def reset_eof_of_pdf_return_stream(pdf_stream_in:list):
	# find the line position of the EOF
	for i, x in enumerate(pdf_stream_in[::-1]):
	if b'%%EOF' in x:
	actual_line = len(pdf_stream_in)-i
	print(f'EOF found at line position {-i} = actual {actual_line}, with value {x}')
	break
	# return the list up to that point
	return pdf_stream_in[:actual_line]

	my_url = 'https://homepages.cwi.nl/~fehr/QC2020/'
	html=urllib.request.urlopen(my_url).read()
	content = BeautifulSoup(html)
	current_link = ''

	pdfs = []
	for link in content.find_all('a'):
	current_link = link.get('href')
	if current_link.endswith('pdf') and "/" not in current_link:
	print('link of pdf: ' + current_link)
	pdfs.append(current_link)

	np.savetxt("pdfs.txt",pdfs,fmt="%s")

	# for pdf in pdfs:
	# download_file(my_url, pdf)

	# for pdf in pdfs:
	# # opens the file for reading
	# with open('pdf_files/'+pdf, 'rb') as p:
	# txt = (p.readlines())

	# # get the new list terminating correctly
	# txt_right_eof = reset_eof_of_pdf_return_stream(txt)

	# # write to new pdf
	# with open(pdf.split('.')[0]+"_fixed.pdf"), 'wb' as f:
	# f.writelines(txt_right_eof)

	# merger = PdfFileMerger()

	# for pdf in pdfs:
	# merger.append(pdf)

	# merger.write("qc_book_all.pdf")
	# merger.close()


	# #Create an instance of PdfFileMerger() class
	# merger = PdfFileMerger()
	# #Create a list with PDF file names
	# path_to_files = r'pdf_files/'
	# #Get the file names in the directory
	# for root, dirs, pdfs in os.walk(path_to_files):
	# #Iterate over the list of file names
	# for file_name in pdfs:
	# #Append PDF files
	# merger.append(path_to_files + file_name)
	# #Write out the merged PDF
	# merger.write("merged_all_pages.pdf")
	# merger.close()


	# import unicodedata
	# import numpy as np
	# import re
	# full = np.loadtxt("pdfs.txt")
	# half = np.copy(full)
	# for txt in half:
	# txt = unicodedata.normalize('NFKC', res = (re.sub('.', lambda x: r'\u % 04X' % ord(x.group()), txt))).encode('ascii', 'ignore')
	# np.savetxt("pdfs_half.txt",half)