Skip to content

Instantly share code, notes, and snippets.

@oliora
Last active December 21, 2024 16:06
Show Gist options
  • Save oliora/0f24eeab893efe87840431c319c58e43 to your computer and use it in GitHub Desktop.
Save oliora/0f24eeab893efe87840431c319c58e43 to your computer and use it in GitHub Desktop.
Extract tables from PDF into CSV
#!/usr/bin/env python3
'''
Usage (on Mac):
Extract tables to clipboard. Simply paste it to Google Spreadsheets afterwards:
python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' | pbcopy
Extract to a file:
python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' > filename.txt
Requirements:
1. [Camelot](https://camelot-py.readthedocs.io/en/master/)
pip install "camelot-py[base]"
Note that you need to install OS dependencies as described in
https://camelot-py.readthedocs.io/en/master/user/install-deps.html
On Mac:
brew install ghostscript tcl-tk
You may also need to downgrade PyPDF after installing Camelot:
pip install 'PyPDF2<3.0'
'''
import argparse
import io
import logging
import os
import pandas
import camelot
# getLogger() is called without a name, so this is the root logger; main()
# configures its level and format through logging.basicConfig().
logger = logging.getLogger()
def main():
    """Extract tables from a PDF with Camelot and print them to stdout.

    Command-line arguments select the input file, the pages to read, the
    minimum accuracy a table must reach, an optional header cell a table
    must contain, and whether line breaks inside cells are replaced with
    spaces.

    Every table that passes the filters is emitted as a ``# Table
    <page>.<order>`` line followed by its rows as tab-separated values and
    a blank line.  All accepted tables are collected in memory and printed
    in one go; progress messages are reported through ``logging``.
    """
    # TODO: extract page titles
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally
    # would show the whole sentence as the program name in the usage line.
    argp = argparse.ArgumentParser(
        description='Extract tables from PDF to CSV files')
    argp.add_argument('input',
                      help='Path to pdf file to extract tables data')
    argp.add_argument('--pages', '-p', default='all',
                      help='Comma separated list of pages to extract like "1", "5,7", "5-10", "2,7-end", "all" (Default: all)')
    argp.add_argument('--min-accuracy', '-a', type=float, default=98,
                      help='Minimal recognition accuracy per cent (Default: 98)')
    argp.add_argument('--required-header',
                      help='Only save tables that have this field in the header')
    argp.add_argument('--remove-eols', action='store_true',
                      help='Replace EOLs with spaces in table data')
    args = argp.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s.%(msecs)03d %(levelname)5s [%(name)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

    logger.info(f'Read pages {args.pages} from file {args.input}')
    tables = camelot.read_pdf(args.input, pages=args.pages)

    def header_filter(headers: pandas.Series) -> bool:
        """Return True when the table should be kept based on its header row.

        Without ``--required-header`` every table is kept; otherwise at
        least one header cell must equal the requested value exactly.
        """
        if not args.required_header:
            return True
        return any(h == args.required_header for h in headers)

    def transform(series: pandas.Series) -> pandas.Series:
        """Apply the optional EOL clean-up to a row or column of cells."""
        if args.remove_eols:
            # Whitespace preceding a line break is swallowed together with
            # the break, so each break becomes exactly one space.
            series = series.str.replace('\\s*\n', ' ', regex=True)
        return series

    total_count = 0

    # The buffer was previously named ``os``, which shadowed the imported
    # ``os`` module inside this function.
    with io.StringIO() as buffer:
        for t in tables:
            # Counts every detected table, including the ones skipped below.
            total_count += 1

            if t.accuracy < args.min_accuracy:
                logger.info(f'Skip table {t.page}.{t.order} because accuracy {t.accuracy} is too low')
                continue

            df: pandas.DataFrame = t.df

            # The first row is treated as the header; it is cleaned up
            # first so multi-line header cells can still match.
            if not header_filter(transform(df.iloc[0])):
                logger.info(f'Skip table {t.page}.{t.order}, because of the header filter')
                continue

            logger.info(f'Save table {t.page}.{t.order}')

            for column in df:
                df[column] = transform(df[column])

            buffer.write(f'# Table {t.page}.{t.order}\n')
            df.to_csv(buffer, header=False, index=False, sep='\t', lineterminator='\n')
            buffer.write('\n')

        print(buffer.getvalue())

    logger.info(f'Found {total_count} tables')
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment