PDF to CSV Extractor with Python

A simple CLI tool for extracting tables from PDF files and saving them as CSV files.

Features

  • Extract tables from PDF files using local LLM inference
  • Save extracted tables as CSV files
  • Works with both single PDF files and directories of PDFs
  • Configurable model parameters

Requirements

  • Python 3.8+
  • LLM model file (e.g., Llama 2/3 or Mistral in GGUF format)

Installation

# Create a virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install required packages
pip install llama-cpp-python pypdf pandas

Usage

Basic Usage

python pdf_table_extractor.py extract --input sample.pdf --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf

Process all PDFs in a directory

python pdf_table_extractor.py extract --input pdf_directory/ --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf

With GPU acceleration (if you have a compatible GPU)

python pdf_table_extractor.py extract --input sample.pdf --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf --n_gpu_layers 32
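
Note that --n_gpu_layers has no effect unless llama-cpp-python was compiled with a GPU backend. As a rough example for a CUDA build (the exact CMake flag depends on your llama-cpp-python version and hardware backend):

CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir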

Command Line Arguments

  • --input, -i: Path to input PDF file or directory
  • --output, -o: Directory to save CSV files
  • --model_path, -m: Path to the LLM model file
  • --n_ctx: Context window size (default: 2048)
  • --n_gpu_layers: Number of layers to offload to GPU (default: 0)

Obtaining LLM Models

You can download compatible GGUF models from:

  1. TheBloke's Hugging Face page
  2. Hugging Face Model Hub

Recommended models:

  • Llama-2-7B-Chat.Q4_0.gguf
  • Mistral-7B-Instruct-v0.2.Q4_0.gguf
  • Llama-3-8B-Instruct.Q4_0.gguf
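
For example, a quantized Llama 2 chat model can be fetched with the huggingface-cli tool. This is a sketch, assuming huggingface_hub is installed; the repo and file names shown are illustrative and may change:

pip install huggingface_hub
huggingface-cli download TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_0.gguf --local-dir models/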

How It Works

  1. The tool extracts text from each page of the PDF
  2. It uses the LLM to detect if a page contains a table
  3. When a table is detected, the LLM parses the table structure
  4. The parsed table is saved as a CSV file
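
The same pipeline can also be driven from Python rather than the CLI. A minimal sketch, assuming the script below is saved as pdf_table_extractor.py:

from pdf_table_extractor import PDFTableExtractor

# Load the model once, then process a PDF; process_pdf returns the table count
extractor = PDFTableExtractor(model_path="models/llama-2-7b-chat.Q4_0.gguf")
count = extractor.process_pdf("sample.pdf", "tables/")
print(f"Extracted {count} tables")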

Limitations

  • Complex tables with merged cells may not be parsed correctly
  • Very large tables exceeding the LLM's context window may be truncated
  • Image-based PDFs require OCR pre-processing (not included)
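
For image-only PDFs, one possible workaround (not part of this tool) is to run them through OCRmyPDF first, which adds a searchable text layer that pypdf can then read. The file names below are illustrative, and OCRmyPDF additionally requires the Tesseract OCR engine on the system:

pip install ocrmypdf
ocrmypdf scanned.pdf scanned_ocr.pdf
python pdf_table_extractor.py extract --input scanned_ocr.pdf --output tables/ --model_path models/llama-2-7b-chat.Q4_0.gguf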

Future Improvements

  • Add OCR support for image-based PDFs
  • Improve table detection accuracy
  • Add support for more output formats
  • Implement batch processing with progress tracking
pdf_table_extractor.py

#!/usr/bin/env python3
"""
PDF to CSV Extractor with Python

A CLI tool that extracts tables from PDFs and saves them as CSV files.
Built using llama-cpp-python for the LLM backend and other open-source libraries.

Usage:
    python pdf_table_extractor.py extract --input sample.pdf --output tables/ --model_path llama-2-7b-chat.Q4_0.gguf
"""

import argparse
import csv
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List

import pandas as pd
import pypdf
from llama_cpp import Llama

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('pdf_table_extractor')


class PDFTableExtractor:
    def __init__(self, model_path: str, n_ctx: int = 2048, n_gpu_layers: int = 0):
        """
        Initialize the PDF Table Extractor with the LLM model.

        Args:
            model_path: Path to the LLM model file
            n_ctx: Context window size
            n_gpu_layers: Number of layers to offload to GPU
        """
        logger.info(f"Loading model from {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=n_ctx,
                n_gpu_layers=n_gpu_layers
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {str(e)}")
            sys.exit(1)

    def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, str]:
        """
        Extract text from each page of the PDF.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Dictionary with page numbers as keys and page texts as values
        """
        logger.info(f"Extracting text from {pdf_path}")
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = pypdf.PdfReader(file)
                pages = {}
                # Page numbers are 1-based to match how readers refer to PDF pages
                for i, page in enumerate(pdf_reader.pages, start=1):
                    pages[i] = page.extract_text()
                return pages
        except Exception as e:
            logger.error(f"Failed to extract text from PDF: {str(e)}")
            sys.exit(1)

    def detect_tables(self, page_text: str) -> bool:
        """
        Detect if a page contains tables using the LLM.

        Args:
            page_text: Text content of a page

        Returns:
            Boolean indicating if tables are present
        """
        # Limit to the first 1000 chars so the prompt fits the model context
        prompt = f"""
The following is text extracted from a PDF page. Does this text contain a table?
Respond with CONTAINS_TABLE if the text seems to include tabular data or NO_TABLE if not.
TEXT:
{page_text[:1000]}
RESPONSE:
"""
        response = self.llm(prompt, max_tokens=50)
        return "CONTAINS_TABLE" in response["choices"][0]["text"]

    def parse_table(self, page_text: str) -> List[List[str]]:
        """
        Parse table structure from text using the LLM.

        Args:
            page_text: Text content containing tables

        Returns:
            List of lists representing table rows and columns
        """
        # Limit to the first 3000 chars so the prompt fits the model context
        prompt = f"""
The following text contains tabular data extracted from a PDF.
Convert this data into a structured table format.
Output the table as a JSON array of arrays where each inner array represents a row of the table.
Only output valid JSON, nothing else.
TEXT:
{page_text[:3000]}
JSON ARRAY:
"""
        try:
            response = self.llm(prompt, max_tokens=2000)
            output_text = response["choices"][0]["text"].strip()
            # Try to extract just the JSON array in case the model adds extra text
            match = re.search(r'\[\s*\[.*\]\s*\]', output_text, re.DOTALL)
            if match:
                output_text = match.group(0)
            # Parse JSON array
            table_data = json.loads(output_text)
            return table_data
        except json.JSONDecodeError:
            logger.warning("Failed to parse table as JSON. Attempting alternative parsing method...")
            # Fall back to a simpler parsing strategy
            return self._fallback_table_parsing(page_text)

    def _fallback_table_parsing(self, page_text: str) -> List[List[str]]:
        """
        Fallback method to parse tables when JSON parsing fails.

        Args:
            page_text: Text content containing tables

        Returns:
            List of lists representing table rows and columns
        """
        # Simple heuristic: split into lines, then split each line on runs of spaces
        rows = []
        for line in page_text.split('\n'):
            if line.strip():
                # Split by multiple spaces (2 or more)
                cells = re.split(r'\s{2,}', line.strip())
                rows.append([cell.strip() for cell in cells])
        return rows

    def save_as_csv(self, table_data: List[List[str]], output_path: str):
        """
        Save the extracted table data to a CSV file.

        Args:
            table_data: List of lists representing table rows and columns
            output_path: Path to save the CSV file
        """
        try:
            with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for row in table_data:
                    writer.writerow(row)
            logger.info(f"Saved table to {output_path}")
        except Exception as e:
            logger.error(f"Failed to save CSV: {str(e)}")

    def process_pdf(self, pdf_path: str, output_dir: str):
        """
        Process a PDF file and extract tables to CSV files.

        Args:
            pdf_path: Path to the PDF file
            output_dir: Directory to save the CSV files
        """
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Extract text from PDF
        pages = self.extract_text_from_pdf(pdf_path)

        # Track how many tables we've extracted
        table_count = 0

        # Process each page
        for page_num, page_text in pages.items():
            logger.info(f"Processing page {page_num}")
            # Check if page contains tables
            if self.detect_tables(page_text):
                logger.info(f"Table detected on page {page_num}")
                # Parse table
                table_data = self.parse_table(page_text)
                # Save table to CSV
                pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
                output_path = os.path.join(output_dir, f"{pdf_name}_page{page_num}_table{table_count}.csv")
                self.save_as_csv(table_data, output_path)
                table_count += 1
            else:
                logger.info(f"No tables detected on page {page_num}")

        logger.info(f"Extraction complete. Found {table_count} tables.")
        return table_count


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description='Extract tables from PDFs and save as CSV files.')
    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Extract command
    extract_parser = subparsers.add_parser('extract', help='Extract tables from PDFs')
    extract_parser.add_argument('--input', '-i', required=True, help='Path to input PDF file or directory')
    extract_parser.add_argument('--output', '-o', required=True, help='Directory to save CSV files')
    extract_parser.add_argument('--model_path', '-m', required=True, help='Path to the LLM model file')
    extract_parser.add_argument('--n_ctx', type=int, default=2048, help='Context window size')
    extract_parser.add_argument('--n_gpu_layers', type=int, default=0, help='Number of layers to offload to GPU')
    return parser.parse_args()


def main():
    """Main entry point for the script."""
    args = parse_args()
    if args.command == 'extract':
        extractor = PDFTableExtractor(
            model_path=args.model_path,
            n_ctx=args.n_ctx,
            n_gpu_layers=args.n_gpu_layers
        )
        input_path = Path(args.input)
        if input_path.is_file():
            # Process a single PDF file
            logger.info(f"Processing file: {input_path}")
            extractor.process_pdf(str(input_path), args.output)
        elif input_path.is_dir():
            # Process all PDF files in the directory
            logger.info(f"Processing all PDFs in directory: {input_path}")
            pdf_files = list(input_path.glob('*.pdf'))
            if not pdf_files:
                logger.warning(f"No PDF files found in {input_path}")
                return
            for pdf_file in pdf_files:
                logger.info(f"Processing file: {pdf_file}")
                extractor.process_pdf(str(pdf_file), args.output)
        else:
            logger.error(f"Input path {input_path} does not exist")
            sys.exit(1)
    else:
        logger.error("Please specify a command. Use -h for help.")
        sys.exit(1)


if __name__ == "__main__":
    main()