Last active
December 21, 2024 16:06
-
-
Save oliora/0f24eeab893efe87840431c319c58e43 to your computer and use it in GitHub Desktop.
Extract tables from PDF into CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' | |
Usage (on Mac): | |
Extract tables to clipboard. Simply paste it to Google Spreadsheets afterwards: | |
python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' | pbcopy
Extract to a file: | |
python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' > filename.txt
Requirements: | |
1. [Camelot](https://camelot-py.readthedocs.io/en/master/) | |
pip install "camelot-py[base]" | |
Note that you need to install OS dependencies as described in | |
https://camelot-py.readthedocs.io/en/master/user/install-deps.html | |
On Mac: | |
brew install ghostscript tcl-tk | |
You may also need to downgrade PyPDF after installing Camelot: | |
pip install 'PyPDF2<3.0' | |
''' | |
import argparse | |
import io | |
import logging | |
import os | |
import pandas | |
import camelot | |
# Root logger (no name given); main() attaches handlers to it via logging.basicConfig().
logger = logging.getLogger()
def main():
    """Extract tables from a PDF with Camelot and print them as tab-separated text.

    Command-line arguments are read from ``sys.argv``:

    * ``input`` -- path to the PDF file.
    * ``--pages/-p`` -- page selection string handed to ``camelot.read_pdf``.
    * ``--min-accuracy/-a`` -- tables whose ``accuracy`` is below this value
      are skipped.
    * ``--required-header`` -- when given, only tables whose first row
      contains exactly this value are kept.
    * ``--remove-eols`` -- replace line breaks (and the whitespace before
      them) inside cells with a single space.

    Each kept table is emitted as a ``# Table <page>.<order>`` title line,
    its rows (tab-separated, no header or index) and a blank line. All output
    is collected in memory and printed to stdout once at the end; progress
    messages go through ``logger``.
    """
    # TODO: extract page titles
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would replace the program
    # name in the usage line and leave the help without a description.
    argp = argparse.ArgumentParser(description="Extract tables from PDF to CSV files")
    argp.add_argument('input',
                      help='Path to pdf file to extract tables data')
    argp.add_argument('--pages', '-p', default='all',
                      help='Comma separated list of pages to extract like "1", "5,7", "5-10", "2,7-end", "all" (Default: all)')
    argp.add_argument('--min-accuracy', '-a', type=float, default=98,
                      help='Minimal recognition accuracy per cent (Default: 98)')
    argp.add_argument('--required-header',
                      help='Only save tables that have this field in the header')
    argp.add_argument('--remove-eols', action='store_true', help='Replace EOLs with spaces in table data')
    args = argp.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s.%(msecs)03d %(levelname)5s [%(name)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

    logger.info(f'Read pages {args.pages} from file {args.input}')
    tables = camelot.read_pdf(args.input, pages=args.pages)

    def header_filter(headers: pandas.Series) -> bool:
        """Return True when no header is required or ``headers`` contains it."""
        if args.required_header:
            return any(h == args.required_header for h in headers)
        return True

    def transform(series: pandas.Series) -> pandas.Series:
        """Apply the optional EOL clean-up to every string in ``series``."""
        if args.remove_eols:
            series = series.str.replace('\\s*\n', ' ', regex=True)
        return series

    total_count = 0

    # The buffer is deliberately not called ``os`` so that it does not shadow
    # the imported ``os`` module inside this function.
    with io.StringIO() as out:
        for t in tables:
            total_count += 1

            if t.accuracy < args.min_accuracy:
                logger.info(f'Skip table {t.page}.{t.order} because accuracy {t.accuracy} is too low')
                continue

            df: pandas.DataFrame = t.df

            # An empty frame has no first row; ``df.iloc[0]`` would raise
            # IndexError, so skip such tables instead of aborting the run.
            if df.empty:
                logger.info(f'Skip table {t.page}.{t.order} because it is empty')
                continue

            # The first row is treated as the header; it is cleaned first so
            # the comparison sees the same text that would be written out.
            if not header_filter(transform(df.iloc[0])):
                logger.info(f'Skip table {t.page}.{t.order}, because of the header filter')
                continue

            logger.info(f'Save table {t.page}.{t.order}')

            for column in df:
                df[column] = transform(df[column])

            out.write(f'# Table {t.page}.{t.order}\n')
            df.to_csv(out, header=False, index=False, sep='\t', lineterminator='\n')
            out.write('\n')

        print(out.getvalue())

    logger.info(f'Found {total_count} tables')
# Run the extraction only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment