subpath · April 1, 2019 04:46
diff --git a/data_extraction_from_pdf.py b/data_extraction_from_pdf.py
 """Read pdf files and extract tabular data."""
 from tabula import read_pdf


 def file_to_dataframe(file_path):
    """Read the potency and terpenes tables from a pdf and save them as csv.

    The tables are read with ``read_pdf(file_path, multiple_tables=True)``.
    The table at index 0 is used as the potency table and the table at
    index 3 as the terpenes table. From each one the first two rows and the
    third column are dropped, and the remaining columns are renamed.

    The results are written to ``<base>_potency.csv`` and
    ``<base>_terpenes.csv``, where ``<base>`` is ``file_path`` with a
    trailing ``.pdf`` (matched case-insensitively) removed. If the path
    does not end in ``.pdf`` the suffixes are appended to the full path.

    Args:
       file_path: Path of the pdf file, as a string.

    Raises:
       IndexError: If fewer than four tables were extracted from the pdf.
    """
    potency_index = 0
    terpenes_index = 3
    pdf_suffix = '.pdf'

    tables = read_pdf(file_path, multiple_tables=True)
    if len(tables) <= terpenes_index:
       raise IndexError(
          'expected at least %d tables in %s, found %d'
          % (terpenes_index + 1, file_path, len(tables)))

    # Strip only a trailing '.pdf' (any case). The previous str.replace()
    # rewrote every '.pdf' in the path, and when the path contained no
    # lowercase '.pdf' it returned the path unchanged, so the csv output
    # overwrote the input file.
    if file_path[-len(pdf_suffix):].lower() == pdf_suffix:
       base_path = file_path[:-len(pdf_suffix)]
    else:
       base_path = file_path

    # extract and format potency table
    # NOTE(review): the two-name column assignment assumes exactly two
    # columns remain after the drop, i.e. three in the raw table - confirm.
    potency_table = tables[potency_index]
    potency_table = potency_table.drop(potency_table.index[[0, 1]])
    potency_table = potency_table.drop(potency_table.columns[2], axis=1)
    potency_table.columns = ['cannabinoid', 'w/w']

    # extract and format terpenes table (same layout assumption as above)
    terpenes_table = tables[terpenes_index]
    terpenes_table = terpenes_table.drop(terpenes_table.index[[0, 1]])
    terpenes_table = terpenes_table.drop(terpenes_table.columns[2], axis=1)
    terpenes_table.columns = ['terpenes', 'w/w']

    # save tables as csv; both tables are formatted first so nothing is
    # written if either table has an unexpected layout
    potency_table.to_csv(base_path + '_potency.csv')
    terpenes_table.to_csv(base_path + '_terpenes.csv')
	"""Read pdf files and extract tabular data."""
	from tabula import read_pdf


	def file_to_dataframe(file_path):
		"""Read the potency and terpenes tables from a pdf and save them as csv.

		The tables are read with ``read_pdf(file_path, multiple_tables=True)``.
		The table at index 0 is used as the potency table and the table at
		index 3 as the terpenes table. From each one the first two rows and the
		third column are dropped, and the remaining columns are renamed.

		The results are written to ``<base>_potency.csv`` and
		``<base>_terpenes.csv``, where ``<base>`` is ``file_path`` with a
		trailing ``.pdf`` (matched case-insensitively) removed. If the path
		does not end in ``.pdf`` the suffixes are appended to the full path.

		Args:
			file_path: Path of the pdf file, as a string.

		Raises:
			IndexError: If fewer than four tables were extracted from the pdf.
		"""
		potency_index = 0
		terpenes_index = 3
		pdf_suffix = '.pdf'

		tables = read_pdf(file_path, multiple_tables=True)
		if len(tables) <= terpenes_index:
			raise IndexError(
				'expected at least %d tables in %s, found %d'
				% (terpenes_index + 1, file_path, len(tables)))

		# Strip only a trailing '.pdf' (any case). The previous str.replace()
		# rewrote every '.pdf' in the path, and when the path contained no
		# lowercase '.pdf' it returned the path unchanged, so the csv output
		# overwrote the input file.
		if file_path[-len(pdf_suffix):].lower() == pdf_suffix:
			base_path = file_path[:-len(pdf_suffix)]
		else:
			base_path = file_path

		# extract and format potency table
		# NOTE(review): the two-name column assignment assumes exactly two
		# columns remain after the drop, i.e. three in the raw table - confirm.
		potency_table = tables[potency_index]
		potency_table = potency_table.drop(potency_table.index[[0, 1]])
		potency_table = potency_table.drop(potency_table.columns[2], axis=1)
		potency_table.columns = ['cannabinoid', 'w/w']

		# extract and format terpenes table (same layout assumption as above)
		terpenes_table = tables[terpenes_index]
		terpenes_table = terpenes_table.drop(terpenes_table.index[[0, 1]])
		terpenes_table = terpenes_table.drop(terpenes_table.columns[2], axis=1)
		terpenes_table.columns = ['terpenes', 'w/w']

		# save tables as csv; both tables are formatted first so nothing is
		# written if either table has an unexpected layout
		potency_table.to_csv(base_path + '_potency.csv')
		terpenes_table.to_csv(base_path + '_terpenes.csv')
No results found