Created
February 22, 2025 15:07
-
-
Save leandrotoledo/5196d29711e1ae431d77b816b71399b2 to your computer and use it in GitHub Desktop.
This script extracts tables from LCSO report PDFs and formats them as Markdown-style pipe tables for Telegram messages, wrapped in HTML <pre> tags so the columns render as preformatted text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html

import pymupdf
def generate_markdown_table(rows):
    """Generate a Markdown-style pipe table wrapped in ``<pre>`` tags.

    Args:
        rows: A sequence of rows, each an iterable of cell values. Cells are
            converted with ``str``; ``None`` cells are rendered as empty.

    Returns:
        The string ``"No incidents reported"`` when ``rows`` is empty,
        otherwise a ``<pre>...</pre>`` block containing a header line, a
        dashed separator line, and one ``|``-delimited line per row.
    """
    if not rows:
        return "No incidents reported"

    headers = ["Station", "Date / Time", "Location", "Incident"]

    def format_cell(cell):
        # A missing cell (None) should show as blank, not the text "None".
        text = "" if cell is None else str(cell)
        # Collapse embedded newlines so each record stays on a single line,
        # then escape &, < and > so cell text cannot be parsed as markup
        # inside the surrounding <pre> element.
        return html.escape(text.replace("\n", " "), quote=False)

    lines = [
        "<pre>",
        "|" + "|".join(headers) + "|",
        # Separator dashes match each header's width.
        "|" + "|".join("-" * len(header) for header in headers) + "|",
    ]
    lines.extend(
        "|" + "|".join(format_cell(cell) for cell in row) + "|" for row in rows
    )
    lines.append("</pre>")
    return "\n".join(lines)
def process_pdf(pdf_path):
    """Extract table rows from every page of a PDF.

    For each table found, the first line of the table's top-left cell is
    taken as the station label and prepended to every data row. The first
    two extracted rows of each table are skipped (presumably a title row and
    a column-header row -- confirm against the report layout).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of rows; each row is a list whose first element is the
        station label followed by the cells extracted from the table.
    """
    rows = []
    # The context manager closes the document even if extraction raises.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            for tab in page.find_tables().tables:
                table_data = tab.extract()
                if not table_data:
                    continue
                first_row = table_data[0]
                # Guard against an empty first row or an empty (None) first
                # cell so one malformed table does not abort the whole run.
                header_cell = first_row[0] if first_row else None
                station_info = (header_cell or "").split("\n")[0]
                for row in table_data[2:]:
                    # Add station info to each row.
                    rows.append([station_info, *row])
    return rows
if __name__ == "__main__":
    # Reports to render, in output order.
    report_paths = (
        "LCSO Report - 2025-02-18.pdf",
        "LCSO Report - 2025-02-19.pdf",
    )
    for index, report_path in enumerate(report_paths):
        if index:
            # Blank line between consecutive reports' tables.
            print()
        print(generate_markdown_table(process_pdf(report_path)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output Example: