A Python 3 script to run on macOS that extracts links from a PDF document
# Requirements
# pip3 install PyMuPDF      (provides the "fitz" module)
# pip3 install xlsxwriter
# pip3 install requests
#
# Run the script
# python3 extract-links.py

import fitz  # PyMuPDF
import re
import xlsxwriter
import requests


def check_link_status(link):
    # Return the HTTP status code of a HEAD request (500 if the request fails)
    try:
        response = requests.head(link, timeout=10)
        return response.status_code
    except requests.RequestException:
        return 500
def extract_links(pdf_path):
    # Open the PDF file
    pdf = fitz.open(pdf_path)
    links = []
    # Iterate over each page
    for page_num in range(len(pdf)):
        # Get the page
        page = pdf[page_num]
        # Get the list of links in the page
        page_links = page.get_links()
        # Iterate over each link
        for link in page_links:
            # Check if the link is a URI
            if link['kind'] == fitz.LINK_URI:
                # Append the link and page number to the list
                links.append((link['uri'], page_num + 1))
    # Close the PDF file
    pdf.close()
    return links
def write_links_to_file(links, file_path):
    # Write the (link, page number) pairs to a plain-text file, sorted by URL
    links.sort()
    with open(file_path, 'w') as f:
        for link, page_num in links:
            f.write(link + ' : page ' + str(page_num) + '\n')


def get_subfolder_count(url):
    # Count the "/segment" chunks in the URL (the host counts as the first one)
    subfolder_pattern = r"/[^/]+"
    match = re.findall(subfolder_pattern, url)
    return len(match)


def write_to_excel(worksheet, links):
    # Write one (link, page number) pair per row of the given worksheet
    row = 0
    for link, page_num in links:
        worksheet.write(row, 0, link)
        worksheet.write(row, 1, page_num)
        row += 1
    return worksheet
def main():
    pdf_path = 'report.pdf'
    hsph_links_path = 'hsph.txt'
    hsph_top_level_path = 'hsph_top_level.txt'
    hsph_multi_level_path = 'hsph_multi_level.txt'
    other_links_path = 'others.txt'
    broken_path = 'broken.txt'
    redirecting_path = 'redirecting.txt'
    workbook = xlsxwriter.Workbook('links.xlsx')
    summary_worksheet = workbook.add_worksheet('Summary')
    raw_worksheet = workbook.add_worksheet('Raw links')
    hsph_worksheet = workbook.add_worksheet('All Unique HSPH links')
    hsph_top_worksheet = workbook.add_worksheet('Unique HSPH Top links')
    hsph_multi_worksheet = workbook.add_worksheet('Unique HSPH Deep links')
    other_worksheet = workbook.add_worksheet('External')
    redirecting_worksheet = workbook.add_worksheet('Redirections')
    broken_worksheet = workbook.add_worksheet('Broken')

    # Get all links
    links = extract_links(pdf_path)
    print(f"Total links extracted: {len(links)}")
    summary_row = 0
    summary_worksheet.write(summary_row, 0, f"Total links extracted: {len(links)}")
    summary_row += 1
    write_to_excel(raw_worksheet, links)

    # Extract main website links
    hsph_links = list(set([(link, page_num) for link, page_num in links if 'www.hsph' in link]))
    hsph_top_level_links = []
    hsph_multi_level_links = []
    for link, page_num in hsph_links:
        if get_subfolder_count(link) > 2:
            hsph_multi_level_links.append((link, page_num))
        else:
            hsph_top_level_links.append((link, page_num))
    write_links_to_file(hsph_links, hsph_links_path)
    write_to_excel(hsph_worksheet, hsph_links)
    print(f"Unique HSPH links: {len(hsph_links)}")
    summary_worksheet.write(summary_row, 0, f"Unique HSPH links: {len(hsph_links)}")
    summary_row += 1
    write_links_to_file(hsph_top_level_links, hsph_top_level_path)
    write_to_excel(hsph_top_worksheet, hsph_top_level_links)
    print(f"Unique HSPH top level links: {len(hsph_top_level_links)}")
    summary_worksheet.write(summary_row, 0, f"Unique HSPH top level links: {len(hsph_top_level_links)}")
    summary_row += 1
    write_links_to_file(hsph_multi_level_links, hsph_multi_level_path)
    write_to_excel(hsph_multi_worksheet, hsph_multi_level_links)
    print(f"Unique HSPH deep links: {len(hsph_multi_level_links)}")
    summary_worksheet.write(summary_row, 0, f"Unique HSPH deep links: {len(hsph_multi_level_links)}")
    summary_row += 1

    # Extract other domains links
    other_links = list(set([item for item in links if item not in hsph_links]))
    write_links_to_file(other_links, other_links_path)
    write_to_excel(other_worksheet, other_links)
    summary_worksheet.write(summary_row, 0, f"Unique external links: {len(other_links)}")
    summary_row += 1
    print(f"Unique external links: {len(other_links)}")

    # Check links
    redirecting_links = []
    broken_links = []
    redirecting_row = 0
    broken_row = 0
    for link, page_num in links:
        code = check_link_status(link)
        if 300 <= code < 400:
            redirecting_links.append((link + ' code: ' + str(code), page_num))
            redirecting_worksheet.write(redirecting_row, 0, link)
            redirecting_worksheet.write(redirecting_row, 1, str(code))
            redirecting_worksheet.write(redirecting_row, 2, str(page_num))
            redirecting_row += 1
        if 400 <= code < 500:
            broken_links.append((link + ' code: ' + str(code), page_num))
            broken_worksheet.write(broken_row, 0, link)
            broken_worksheet.write(broken_row, 1, str(code))
            broken_worksheet.write(broken_row, 2, str(page_num))
            broken_row += 1
    write_links_to_file(redirecting_links, redirecting_path)
    print(f"Redirecting links: {len(redirecting_links)}")
    summary_worksheet.write(summary_row, 0, f"Redirecting links: {len(redirecting_links)}")
    summary_row += 1
    write_links_to_file(broken_links, broken_path)
    print(f"Broken links: {len(broken_links)}")
    summary_worksheet.write(summary_row, 0, f"Broken links: {len(broken_links)}")
    summary_row += 1
    workbook.close()


if __name__ == "__main__":
    main()
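For anyone who only needs the raw extraction step without the spreadsheet and link-checking passes, here is a minimal standalone sketch of the same PyMuPDF technique. It assumes PyMuPDF is installed and that a file named report.pdf (the same default path the script uses) sits in the working directory:

import fitz  # PyMuPDF

pdf = fitz.open("report.pdf")
for page_num, page in enumerate(pdf, start=1):
    # get_links() returns one dict per link annotation on the page
    for link in page.get_links():
        if link["kind"] == fitz.LINK_URI:
            print(f"page {page_num}: {link['uri']}")
pdf.close()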
This was mostly written using Claude AI; it is a "quick and dirty" document analysis and is not meant to be production code.