Skip to content

Instantly share code, notes, and snippets.

@rcsmit
Created March 31, 2025 15:59
Show Gist options
  • Save rcsmit/ca410fb8f9f80e37e110a928a9ba1a08 to your computer and use it in GitHub Desktop.
Save rcsmit/ca410fb8f9f80e37e110a928a9ba1a08 to your computer and use it in GitHub Desktop.
read_pdf_convert_to_xls.py
# Read a PDF and convert its text to a table, saved as CSV and XLSX.
# Tested on Bijlage B - Wob-deelbesluit 'Vaccinaties en medicatie oktober 2020'.
# Not fully tested.
# inspired by https://x.com/Transparangst/status/1906717209423974689
def _mark_column_breaks(text):
    """Return the text of one PDF page with '#' inserted as column separator.

    Separators are placed after a six-digit number that starts a line, around
    the status labels ("Reeds Openbaar", "Deels Openbaar", "Niet Openbaar",
    "Openbaar"), and in place of the "; " that precedes a "10.", "11." or
    "buiten verzoek" entry.
    """
    import re  # local import: the file has no module-level import block

    # Also match at the very start of the page text: a number on the first
    # line of a page has no preceding newline and used to stay unmarked.
    text = re.sub(r'((?:^|\n)\d{6})', r'\1#', text)
    # Two-word labels come first in the alternation, so the bare "Openbaar"
    # is only wrapped when it is not part of one of them.
    text = re.sub(
        r'(Reeds Openbaar|Deels Openbaar|Niet Openbaar|Openbaar)', r'#\1#', text
    )
    text = text.replace("# ", "#")
    for old, new in (
        ("; 10.", "#10."),
        ("; 11.", "#11."),
        ("; buiten verzoek", "#buiten verzoek"),
    ):
        text = text.replace(old, new)
    return text


def _add_flag_columns(df):
    """Add boolean columns telling which codes occur in columns 3-8 of a row.

    A flag is True when one of those cells equals the code exactly. Columns
    added: "101a".."101g" and "102a".."102g" (codes "10.1.a".."10.2.g"),
    "BuitenVerzoek" ("buiten verzoek") and "111concept" ("11.1, concept").
    The frame is modified in place and also returned.
    """
    # Slice once: the columns appended below must not slide into the 3:9
    # window when the table has fewer than nine text columns.
    cells = df.iloc[:, 3:9]

    def occurs(code):
        return cells.eq(code).any(axis=1)

    for letter in "abcdefg":
        df[f"101{letter}"] = occurs(f"10.1.{letter}")
        df[f"102{letter}"] = occurs(f"10.2.{letter}")
    df["BuitenVerzoek"] = occurs("buiten verzoek")
    df["111concept"] = occurs("11.1, concept")
    return df


def read_directly_from_pdf(
    pdf_path="C:/Users/rcxsm/Downloads/vac_med_okt_2020.pdf",
    output_stem="output",
):
    """Extract the text of a PDF into a '#'-separated table and save it.

    Every page is read with PyPDF2, '#' separators are inserted, each text
    line is split on '#' into one table row, flag columns are added, and the
    table is printed and written to "<output_stem>.csv" and
    "<output_stem>.xlsx".

    Args:
        pdf_path: Path of the PDF to read. Defaults to the path that used to
            be hard-coded in this function.
        output_stem: Output file name without extension. The default
            "output" gives the original "output.csv" / "output.xlsx".
    """
    # Local imports: the file has no module-level import block.
    # PyPDF2 must be installed: pip install PyPDF2
    import pandas as pd
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    number_of_pages = len(reader.pages)

    # Collect the pages in a list and join once instead of growing a string.
    # Each page keeps its leading newline, so the rows are laid out exactly
    # as they were with the original concatenation.
    chunks = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Defensive: treat a falsy extract_text() result as empty text.
        page_text = page.extract_text() or ""
        chunks.append("\n" + _mark_column_breaks(page_text))
        # 1-based counter, so the last page reads "N/N" rather than "N-1/N".
        print(f"Reading page {page_number}/{number_of_pages}")
    all_text = "".join(chunks)

    # Split text into rows and columns using '#' as a separator
    rows = [line.split('#') for line in all_text.splitlines()]
    df = pd.DataFrame(rows)
    print(df)

    _add_flag_columns(df)
    df.to_csv(f"{output_stem}.csv", index=False)
    df.to_excel(f"{output_stem}.xlsx", index=False)
def main():
    """Script entry point: run the conversion via read_directly_from_pdf()."""
    read_directly_from_pdf()
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment