oliora · December 21, 2024 16:06
diff --git a/extract_tables_from_pdf.py b/extract_tables_from_pdf.py
 #!/usr/bin/env python3

 '''
 Usage (on Mac):

 Extract tables to clipboard. Simply paste it to Google Spreadsheets afterwards:

    python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' | pbcopy

 Extract to a file:

    python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' > filename.txt

 Requirements:
 1. [Camelot](https://camelot-py.readthedocs.io/en/master/)

    pip install "camelot-py[base]"

 Note that you need to install OS dependencies as described in
 https://camelot-py.readthedocs.io/en/master/user/install-deps.html

 On Mac:

    brew install ghostscript tcl-tk


 You may also need to downgrade PyPDF after installing Camelot:

    pip install 'PyPDF2<3.0'
 '''

 import argparse
 import io
 import logging
 import os

 import pandas
 import camelot


 # getLogger() is called without a name, so this is the root logger.
 logger = logging.getLogger()


 def main():
    '''Extract tables from a PDF with Camelot and print them to stdout.

    Command-line arguments select the input file, the pages to scan, the
    minimal accuracy a table must reach, an optional header cell a table
    must contain, and whether EOLs inside cells are replaced with spaces.

    Every accepted table is emitted as a ``# Table <page>.<order>`` line,
    followed by its rows as tab-separated values and one blank line.
    Progress and skip reasons are reported through the logging module.
    '''
    # TODO: extract page titles
    # The text is passed as ``description``: the first positional parameter
    # of ArgumentParser is ``prog``, which would put this sentence into the
    # usage line in place of the program name.
    argp = argparse.ArgumentParser(description="Extract tables from PDF to CSV files")
    argp.add_argument('input',
                      help='Path to pdf file to extract tables data')
    argp.add_argument('--pages', '-p', default='all',
                      help='Comma separated list of pages to extract like "1", "5,7", "5-10", "2,7-end", "all" (Default: all)')
    argp.add_argument('--min-accuracy', '-a', type=float, default=98,
                      help='Minimal recognition accuracy per cent (Default: 98)')
    argp.add_argument('--required-header',
                      help='Only save tables that have this field in the header')
    argp.add_argument('--remove-eols', action='store_true', help='Replace EOLs with spaces in table data')
    args = argp.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(asctime)s.%(msecs)03d %(levelname)5s [%(name)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

    logger.info(f'Read pages {args.pages} from file {args.input}')
    tables = camelot.read_pdf(args.input, pages=args.pages)

    def header_filter(headers: pandas.Series) -> bool:
        '''Return True when no header is required or one cell equals it.'''
        if args.required_header:
            return any(h == args.required_header for h in headers)
        return True

    def transform(series: pandas.Series) -> pandas.Series:
        '''Replace each EOL (and whitespace before it) with one space if requested.'''
        if args.remove_eols:
            series = series.str.replace('\\s*\n', ' ', regex=True)
        return series

    total_count = 0

    # The buffer is deliberately not called ``os`` so that it does not shadow
    # the ``os`` module imported at the top of the file.
    with io.StringIO() as out:
        for t in tables:
            # Count every table found, including the ones skipped below.
            total_count += 1
            if t.accuracy < args.min_accuracy:
                logger.info(f'Skip table {t.page}.{t.order} because accuracy {t.accuracy} is too low')
                continue

            df: pandas.DataFrame = t.df

            # The first row is treated as the header; it is cleaned the same
            # way as the data so a multi-line header cell can still match.
            if not header_filter(transform(df.iloc[0])):
                logger.info(f'Skip table {t.page}.{t.order}, because of the header filter')
                continue

            logger.info(f'Save table {t.page}.{t.order}')
            for i in df:
                df[i] = transform(df[i])
            out.write(f'# Table {t.page}.{t.order}\n')
            df.to_csv(out, header=False, index=False, sep='\t', lineterminator='\n')
            out.write('\n')
        # Emit everything in one go once all tables have been processed.
        print(out.getvalue())

    logger.info(f'Found {total_count} tables')


 # Run the command-line tool only when this file is executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python3

	'''
	Usage (on Mac):

	Extract tables to clipboard. Simply paste it to Google Spreadsheets afterwards:

	python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' | pbcopy

	Extract to a file:

	python3 extract_tables_from_pdf.py ~/Downloads/doc.pdf --remove-eols --pages '27-30' > filename.txt

	Requirements:
	1. [Camelot](https://camelot-py.readthedocs.io/en/master/)

	pip install "camelot-py[base]"

	Note that you need to install OS dependencies as described in
	https://camelot-py.readthedocs.io/en/master/user/install-deps.html

	On Mac:

	brew install ghostscript tcl-tk


	You may also need to downgrade PyPDF after installing Camelot:

	pip install 'PyPDF2<3.0'
	'''

	import argparse
	import io
	import logging
	import os

	import pandas
	import camelot


	# getLogger() is called without a name, so this is the root logger.
	logger = logging.getLogger()


	def main():
	    '''Extract tables from a PDF with Camelot and print them to stdout.

	    Command-line arguments select the input file, the pages to scan, the
	    minimal accuracy a table must reach, an optional header cell a table
	    must contain, and whether EOLs inside cells are replaced with spaces.

	    Every accepted table is emitted as a ``# Table <page>.<order>`` line,
	    followed by its rows as tab-separated values and one blank line.
	    Progress and skip reasons are reported through the logging module.
	    '''
	    # TODO: extract page titles
	    # The text is passed as ``description``: the first positional parameter
	    # of ArgumentParser is ``prog``, which would put this sentence into the
	    # usage line in place of the program name.
	    argp = argparse.ArgumentParser(description="Extract tables from PDF to CSV files")
	    argp.add_argument('input',
	                      help='Path to pdf file to extract tables data')
	    argp.add_argument('--pages', '-p', default='all',
	                      help='Comma separated list of pages to extract like "1", "5,7", "5-10", "2,7-end", "all" (Default: all)')
	    argp.add_argument('--min-accuracy', '-a', type=float, default=98,
	                      help='Minimal recognition accuracy per cent (Default: 98)')
	    argp.add_argument('--required-header',
	                      help='Only save tables that have this field in the header')
	    argp.add_argument('--remove-eols', action='store_true', help='Replace EOLs with spaces in table data')
	    args = argp.parse_args()

	    logging.basicConfig(level=logging.INFO, format='%(asctime)s.%(msecs)03d %(levelname)5s [%(name)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

	    logger.info(f'Read pages {args.pages} from file {args.input}')
	    tables = camelot.read_pdf(args.input, pages=args.pages)

	    def header_filter(headers: pandas.Series) -> bool:
	        '''Return True when no header is required or one cell equals it.'''
	        if args.required_header:
	            return any(h == args.required_header for h in headers)
	        return True

	    def transform(series: pandas.Series) -> pandas.Series:
	        '''Replace each EOL (and whitespace before it) with one space if requested.'''
	        if args.remove_eols:
	            series = series.str.replace('\\s*\n', ' ', regex=True)
	        return series

	    total_count = 0

	    # The buffer is deliberately not called ``os`` so that it does not shadow
	    # the ``os`` module imported at the top of the file.
	    with io.StringIO() as out:
	        for t in tables:
	            # Count every table found, including the ones skipped below.
	            total_count += 1
	            if t.accuracy < args.min_accuracy:
	                logger.info(f'Skip table {t.page}.{t.order} because accuracy {t.accuracy} is too low')
	                continue

	            df: pandas.DataFrame = t.df

	            # The first row is treated as the header; it is cleaned the same
	            # way as the data so a multi-line header cell can still match.
	            if not header_filter(transform(df.iloc[0])):
	                logger.info(f'Skip table {t.page}.{t.order}, because of the header filter')
	                continue

	            logger.info(f'Save table {t.page}.{t.order}')
	            for i in df:
	                df[i] = transform(df[i])
	            out.write(f'# Table {t.page}.{t.order}\n')
	            df.to_csv(out, header=False, index=False, sep='\t', lineterminator='\n')
	            out.write('\n')
	        # Emit everything in one go once all tables have been processed.
	        print(out.getvalue())

	    logger.info(f'Found {total_count} tables')


	# Run the command-line tool only when this file is executed as a script.
	if __name__ == '__main__':
	    main()