Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save taufiqibrahim/c2f05e9a16166d9fb3171425191584c0 to your computer and use it in GitHub Desktop.
Save taufiqibrahim/c2f05e9a16166d9fb3171425191584c0 to your computer and use it in GitHub Desktop.
Read all *.pdf files inside PATH_DIR, transpose and compile into single CSV file
"""
Requirements:
pip install tabula-py
pip install tabulate
Read all *.pdf files inside PATH_DIR, transpose each one, and compile them into a single CSV file.
"""
import pandas as pd
import os
from tabula import read_pdf
# Directory scanned for PDF files; the compiled CSV is written next to them.
PATH_DIR = '/mnt/c/tibrahim/ck/documents/customer-data-platform/'
OUTPUT_CSV = os.path.join(PATH_DIR, 'output.csv')


def is_pdf_filename(filename):
    """Return True when *filename* has a ``.pdf`` extension (case-insensitive).

    A plain substring test (``'pdf' in filename``) would also pick up names
    such as ``pdf_notes.txt`` and would miss ``REPORT.PDF``.
    """
    return filename.lower().endswith('.pdf')


def table_to_record(raw_table):
    """Transpose a two-column key/value table into one flat dict.

    Args:
        raw_table: a pandas DataFrame with exactly two columns; the first
            column supplies the keys and the second the values.

    Returns:
        A dict mapping each key to its value. Carriage returns inside string
        values are replaced by a space; non-string values are kept unchanged.

    Raises:
        ValueError: if the table does not have exactly two columns.
    """
    if raw_table.shape[1] != 2:
        raise ValueError(
            'expected a 2-column key/value table, got %d column(s)'
            % raw_table.shape[1]
        )
    # Work on a copy so the caller's DataFrame keeps its original labels.
    table = raw_table.copy()
    table.columns = ['key', 'value']
    data = {}
    for record in table.to_dict('records'):
        value = record['value']
        if isinstance(value, str):
            value = value.replace("\r", " ")
        data[record['key']] = value
    return data


if __name__ == "__main__":
    # Sorted because os.listdir() returns names in arbitrary order; sorting
    # keeps the CSV row order stable from run to run.
    LIST_OF_FILES = sorted(f for f in os.listdir(PATH_DIR) if is_pdf_filename(f))
    LIST_OF_DICTS = []
    for f in LIST_OF_FILES:
        FILEPATH = os.path.join(PATH_DIR, f)
        print(FILEPATH)
        # NOTE(review): with its default options read_pdf presumably treats the
        # first extracted row as the header, so relabelling the columns would
        # drop that row. If these PDFs have no header row, pass
        # pandas_options={'header': None} -- confirm against a sample file.
        document = read_pdf(FILEPATH, pages="all")
        if not document:
            # One PDF without a detectable table should not abort the batch.
            print('No table found, skipping: %s' % FILEPATH)
            continue
        # Only the first table of each PDF is used.
        LIST_OF_DICTS.append(table_to_record(document[0]))
    df = pd.DataFrame(LIST_OF_DICTS)
    # quoting=1 is csv.QUOTE_ALL: every field is wrapped in quotes.
    df.to_csv(OUTPUT_CSV, index=False, quoting=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment