taufiqibrahim · May 3, 2022 18:21
diff --git a/read_pdf_files_transpose_compile_to_csv.py b/read_pdf_files_transpose_compile_to_csv.py
 """
 Requirements:
 pip install tabula-py
 pip install tabulate

 Read all *.pdf files inside PATH_DIR, transpose the first table of each file (key/value pairs) into one row, and compile the rows into a single CSV file.
 """

 import pandas as pd
 import os
 from tabula import read_pdf

 PATH_DIR = '/mnt/c/tibrahim/ck/documents/customer-data-platform/'
 OUTPUT_CSV = os.path.join(PATH_DIR, 'output.csv')


 def is_pdf_file(name):
     """Return True when *name* ends with the ``.pdf`` extension (any case).

     A suffix check is used instead of a substring check so that names such as
     ``pdf_notes.txt`` or ``report.pdf.bak`` are not handed to the PDF reader.
     """
     return name.lower().endswith('.pdf')


 def records_to_row(records):
     """Collapse key/value records into a single dict (one CSV row).

     Args:
         records: iterable of dicts, each with a ``'key'`` and a ``'value'``
             entry.

     Returns:
         A dict mapping every record's key to its value. String values have
         carriage returns replaced by spaces; other values are stored
         unchanged. When a key repeats, the last value wins.
     """
     row = dict()
     for record in records:
         value = record['value']
         if isinstance(value, str):
             # Replace carriage returns inside the text with spaces.
             value = value.replace("\r", " ")
         row[record['key']] = value
     return row


 def main():
     """Compile the first table of every PDF in PATH_DIR into OUTPUT_CSV.

     Each PDF contributes one CSV row whose columns are the keys found in the
     first column of its table. Files in which no table is detected are
     reported and skipped instead of aborting the whole run.
     """
     rows = []
     # os.listdir() returns names in arbitrary order; sort so that the row
     # order of the CSV is reproducible between runs.
     for name in sorted(os.listdir(PATH_DIR)):
         if not is_pdf_file(name):
             continue

         filepath = os.path.join(PATH_DIR, name)
         print(filepath)

         tables = read_pdf(filepath, pages="all")
         if not tables:
             print('No table found, skipping:', filepath)
             continue

         # Only the first detected table is used.
         # NOTE(review): tabula treats the first table row as a header by
         # default; if these tables have no header row, the first key/value
         # pair is lost when the columns are renamed below -- confirm.
         raw_table = tables[0]
         raw_table.columns = ['key', 'value']
         rows.append(records_to_row(raw_table.to_dict('records')))

     # quoting=1 is csv.QUOTE_ALL: every field is wrapped in quotes.
     pd.DataFrame(rows).to_csv(OUTPUT_CSV, index=False, quoting=1)


 if __name__ == '__main__':
     main()
	"""
	Requirements:
	pip install tabula-py
	pip install tabulate

	Read all *.pdf files inside PATH_DIR, transpose the first table of each file (key/value pairs) into one row, and compile the rows into a single CSV file.
	"""

	import pandas as pd
	import os
	from tabula import read_pdf

	PATH_DIR = '/mnt/c/tibrahim/ck/documents/customer-data-platform/'
	OUTPUT_CSV = os.path.join(PATH_DIR, 'output.csv')


	def is_pdf_file(name):
	    """Return True when *name* ends with the ``.pdf`` extension (any case).

	    A suffix check is used instead of a substring check so that names such as
	    ``pdf_notes.txt`` or ``report.pdf.bak`` are not handed to the PDF reader.
	    """
	    return name.lower().endswith('.pdf')


	def records_to_row(records):
	    """Collapse key/value records into a single dict (one CSV row).

	    Args:
	        records: iterable of dicts, each with a ``'key'`` and a ``'value'``
	            entry.

	    Returns:
	        A dict mapping every record's key to its value. String values have
	        carriage returns replaced by spaces; other values are stored
	        unchanged. When a key repeats, the last value wins.
	    """
	    row = dict()
	    for record in records:
	        value = record['value']
	        if isinstance(value, str):
	            # Replace carriage returns inside the text with spaces.
	            value = value.replace("\r", " ")
	        row[record['key']] = value
	    return row


	def main():
	    """Compile the first table of every PDF in PATH_DIR into OUTPUT_CSV.

	    Each PDF contributes one CSV row whose columns are the keys found in the
	    first column of its table. Files in which no table is detected are
	    reported and skipped instead of aborting the whole run.
	    """
	    rows = []
	    # os.listdir() returns names in arbitrary order; sort so that the row
	    # order of the CSV is reproducible between runs.
	    for name in sorted(os.listdir(PATH_DIR)):
	        if not is_pdf_file(name):
	            continue

	        filepath = os.path.join(PATH_DIR, name)
	        print(filepath)

	        tables = read_pdf(filepath, pages="all")
	        if not tables:
	            print('No table found, skipping:', filepath)
	            continue

	        # Only the first detected table is used.
	        # NOTE(review): tabula treats the first table row as a header by
	        # default; if these tables have no header row, the first key/value
	        # pair is lost when the columns are renamed below -- confirm.
	        raw_table = tables[0]
	        raw_table.columns = ['key', 'value']
	        rows.append(records_to_row(raw_table.to_dict('records')))

	    # quoting=1 is csv.QUOTE_ALL: every field is wrapped in quotes.
	    pd.DataFrame(rows).to_csv(OUTPUT_CSV, index=False, quoting=1)


	if __name__ == '__main__':
	    main()