PDF to CSV Extractor with Python

A simple CLI tool for extracting tables from PDF files and saving them as CSV files.

Features

  • Extract tables from PDF files using local LLM inference
  • Save extracted tables as CSV files
  • Works with both single PDF files and directories of PDFs
  • Configurable model parameters

Requirements

  • Python 3.8+
  • LLM model file (e.g., Llama 2/3 or Mistral in GGUF format)

Installation

# Create a virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install required packages
pip install llama-cpp-python pypdf pandas

Usage

Basic Usage

python pdf_table_extractor.py extract --input sample.pdf --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf

Process all PDFs in a directory

python pdf_table_extractor.py extract --input pdf_directory/ --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf

With GPU acceleration (if you have a compatible GPU)

python pdf_table_extractor.py extract --input sample.pdf --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf --n_gpu_layers 32
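
Note that --n_gpu_layers has no effect unless llama-cpp-python was compiled with a GPU backend. As a rough example for a CUDA build (the exact CMake flag depends on your llama-cpp-python version and hardware backend):

CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir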

Command Line Arguments

  • --input, -i: Path to input PDF file or directory
  • --output, -o: Directory to save CSV files
  • --model_path, -m: Path to the LLM model file
  • --n_ctx: Context window size (default: 2048)
  • --n_gpu_layers: Number of layers to offload to GPU (default: 0)

Obtaining LLM Models

You can download compatible GGUF models from:

  1. TheBloke's Hugging Face page
  2. Hugging Face Model Hub

Recommended models:

  • Llama-2-7B-Chat.Q4_0.gguf
  • Mistral-7B-Instruct-v0.2.Q4_0.gguf
  • Llama-3-8B-Instruct.Q4_0.gguf
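
For example, a quantized Llama 2 chat model can be fetched with the huggingface-cli tool. This is a sketch, assuming huggingface_hub is installed; the repo and file names shown are illustrative and may change:

pip install huggingface_hub
huggingface-cli download TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_0.gguf --local-dir models/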

How It Works

  1. The tool extracts text from each page of the PDF
  2. It uses the LLM to detect if a page contains a table
  3. When a table is detected, the LLM parses the table structure
  4. The parsed table is saved as a CSV file
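
The same pipeline can also be driven from Python rather than the CLI. A minimal sketch, assuming the script below is saved as pdf_table_extractor.py:

from pdf_table_extractor import PDFTableExtractor

# Load the model once, then process a PDF; process_pdf returns the table count
extractor = PDFTableExtractor(model_path="models/llama-2-7b-chat.Q4_0.gguf")
count = extractor.process_pdf("sample.pdf", "tables/")
print(f"Extracted {count} tables")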

Limitations

  • Complex tables with merged cells may not be parsed correctly
  • Very large tables exceeding the LLM's context window may be truncated
  • Image-based PDFs require OCR pre-processing (not included)
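
For image-only PDFs, one possible workaround (not part of this tool) is to run them through OCRmyPDF first, which adds a searchable text layer that pypdf can then read. The file names below are illustrative, and OCRmyPDF additionally requires the Tesseract OCR engine on the system:

pip install ocrmypdf
ocrmypdf scanned.pdf scanned_ocr.pdf
python pdf_table_extractor.py extract --input scanned_ocr.pdf --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf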

Future Improvements

  • Add OCR support for image-based PDFs
  • Improve table detection accuracy
  • Add support for more output formats
  • Implement batch processing with progress tracking
pdf_table_extractor.py

#!/usr/bin/env python3
"""
PDF to CSV Extractor with Python

A CLI tool that extracts tables from PDFs and saves them as CSV files.
Built using llama-cpp-python for the LLM backend and other open-source libraries.

Usage:
    python pdf_table_extractor.py extract --input sample.pdf --output tables/ --model_path llama-2-7b-chat.Q4_0.gguf
"""

import argparse
import csv
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List

import pandas as pd
import pypdf
from llama_cpp import Llama

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('pdf_table_extractor')


class PDFTableExtractor:
    def __init__(self, model_path: str, n_ctx: int = 2048, n_gpu_layers: int = 0):
        """
        Initialize the PDF Table Extractor with the LLM model.

        Args:
            model_path: Path to the LLM model file
            n_ctx: Context window size
            n_gpu_layers: Number of layers to offload to GPU
        """
        logger.info(f"Loading model from {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {str(e)}")
            sys.exit(1)

    def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, str]:
        """
        Extract text from each page of the PDF.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Dictionary with page numbers as keys and page texts as values
        """
        logger.info(f"Extracting text from {pdf_path}")
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = pypdf.PdfReader(file)
                pages = {}
                # Page numbers are 1-based to match how readers refer to PDF pages
                for i, page in enumerate(pdf_reader.pages, start=1):
                    pages[i] = page.extract_text()
                return pages
        except Exception as e:
            logger.error(f"Failed to extract text from PDF: {str(e)}")
            sys.exit(1)

    def detect_tables(self, page_text: str) -> bool:
        """
        Detect if a page contains tables using the LLM.

        Args:
            page_text: Text content of a page

        Returns:
            Boolean indicating if tables are present
        """
        # Limit to the first 1000 chars so the prompt fits the model context
        prompt = f"""
The following is text extracted from a PDF page. Does this text contain a table?
Respond with CONTAINS_TABLE if the text seems to include tabular data or NO_TABLE if not.
TEXT:
{page_text[:1000]}
RESPONSE:
"""
        response = self.llm(prompt, max_tokens=50)
        return "CONTAINS_TABLE" in response["choices"][0]["text"]

    def parse_table(self, page_text: str) -> List[List[str]]:
        """
        Parse table structure from text using the LLM.

        Args:
            page_text: Text content containing tables

        Returns:
            List of lists representing table rows and columns
        """
        # Limit to the first 3000 chars so the prompt fits the model context
        prompt = f"""
The following text contains tabular data extracted from a PDF.
Convert this data into a structured table format.
Output the table as a JSON array of arrays where each inner array represents a row of the table.
Only output valid JSON, nothing else.
TEXT:
{page_text[:3000]}
JSON ARRAY:
"""
        try:
            response = self.llm(prompt, max_tokens=2000)
            output_text = response["choices"][0]["text"].strip()
            # Try to extract just the JSON array in case the model adds extra text
            match = re.search(r'\[\s*\[.*\]\s*\]', output_text, re.DOTALL)
            if match:
                output_text = match.group(0)
            # Parse JSON array
            table_data = json.loads(output_text)
            return table_data
        except json.JSONDecodeError:
            logger.warning("Failed to parse table as JSON. Attempting alternative parsing method...")
            # Fall back to a simpler parsing strategy
            return self._fallback_table_parsing(page_text)

    def _fallback_table_parsing(self, page_text: str) -> List[List[str]]:
        """
        Fallback method to parse tables when JSON parsing fails.

        Args:
            page_text: Text content containing tables

        Returns:
            List of lists representing table rows and columns
        """
        # Simple heuristic: split into lines, then split each line on runs of spaces
        rows = []
        for line in page_text.split('\n'):
            if line.strip():
                # Split by multiple spaces (2 or more)
                cells = re.split(r'\s{2,}', line.strip())
                rows.append([cell.strip() for cell in cells])
        return rows

    def save_as_csv(self, table_data: List[List[str]], output_path: str):
        """
        Save the extracted table data to a CSV file.

        Args:
            table_data: List of lists representing table rows and columns
            output_path: Path to save the CSV file
        """
        try:
            with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for row in table_data:
                    writer.writerow(row)
            logger.info(f"Saved table to {output_path}")
        except Exception as e:
            logger.error(f"Failed to save CSV: {str(e)}")

    def process_pdf(self, pdf_path: str, output_dir: str):
        """
        Process a PDF file and extract tables to CSV files.

        Args:
            pdf_path: Path to the PDF file
            output_dir: Directory to save the CSV files
        """
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Extract text from PDF
        pages = self.extract_text_from_pdf(pdf_path)

        # Track how many tables we've extracted
        table_count = 0

        # Process each page
        for page_num, page_text in pages.items():
            logger.info(f"Processing page {page_num}")
            # Check if page contains tables
            if self.detect_tables(page_text):
                logger.info(f"Table detected on page {page_num}")
                # Parse table
                table_data = self.parse_table(page_text)
                # Save table to CSV
                pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
                output_path = os.path.join(output_dir, f"{pdf_name}_page{page_num}_table{table_count}.csv")
                self.save_as_csv(table_data, output_path)
                table_count += 1
            else:
                logger.info(f"No tables detected on page {page_num}")

        logger.info(f"Extraction complete. Found {table_count} tables.")
        return table_count


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description='Extract tables from PDFs and save as CSV files.')
    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Extract command
    extract_parser = subparsers.add_parser('extract', help='Extract tables from PDFs')
    extract_parser.add_argument('--input', '-i', required=True, help='Path to input PDF file or directory')
    extract_parser.add_argument('--output', '-o', required=True, help='Directory to save CSV files')
    extract_parser.add_argument('--model_path', '-m', required=True, help='Path to the LLM model file')
    extract_parser.add_argument('--n_ctx', type=int, default=2048, help='Context window size')
    extract_parser.add_argument('--n_gpu_layers', type=int, default=0, help='Number of layers to offload to GPU')
    return parser.parse_args()


def main():
    """Main entry point for the script."""
    args = parse_args()
    if args.command == 'extract':
        extractor = PDFTableExtractor(
            model_path=args.model_path,
            n_ctx=args.n_ctx,
            n_gpu_layers=args.n_gpu_layers
        )
        input_path = Path(args.input)
        if input_path.is_file():
            # Process a single PDF file
            logger.info(f"Processing file: {input_path}")
            extractor.process_pdf(str(input_path), args.output)
        elif input_path.is_dir():
            # Process all PDF files in the directory
            logger.info(f"Processing all PDFs in directory: {input_path}")
            pdf_files = list(input_path.glob('*.pdf'))
            if not pdf_files:
                logger.warning(f"No PDF files found in {input_path}")
                return
            for pdf_file in pdf_files:
                logger.info(f"Processing file: {pdf_file}")
                extractor.process_pdf(str(pdf_file), args.output)
        else:
            logger.error(f"Input path {input_path} does not exist")
            sys.exit(1)
    else:
        logger.error("Please specify a command. Use -h for help.")
        sys.exit(1)


if __name__ == "__main__":
    main()