Skip to content

Instantly share code, notes, and snippets.

@brokosz
Last active April 5, 2025 00:07
Show Gist options
  • Save brokosz/7dc98479659a838cfdbfa6aa50b2ea9c to your computer and use it in GitHub Desktop.
Save brokosz/7dc98479659a838cfdbfa6aa50b2ea9c to your computer and use it in GitHub Desktop.
Some quick scripts to strip plain text out of common file types to support LLM data collections.
#!/usr/bin/env python3
import sys
from pathlib import Path
try:
from docx import Document
except ImportError:
print("Error: python-docx package not installed.")
print("Install it with: pip install python-docx")
sys.exit(1)
def extract_text_from_docx(docx_path):
    """Extract text from a Word document (.docx).

    The output lists, in this order: body paragraphs, then every table
    (introduced by a "TABLE CONTENT:" line, non-empty cells of a row joined
    with " | "), then header and footer paragraphs of each section, prefixed
    with "HEADER: " / "FOOTER: ". Blank paragraphs and cells are skipped and
    the collected items are separated by blank lines.

    Args:
        docx_path: Path of the .docx file; handed to ``Document`` unchanged.

    Returns:
        The extracted text. This function never raises: on any failure it
        returns the string "Error extracting text: <reason>" instead.
    """
    try:
        doc = Document(docx_path)
        content = []

        # Process paragraphs
        for paragraph in doc.paragraphs:
            text = paragraph.text.strip()
            if text:
                content.append(text)

        # Process tables
        for table in doc.tables:
            content.append("\nTABLE CONTENT:")
            for row in table.rows:
                stripped_cells = (cell.text.strip() for cell in row.cells)
                row_text = [cell_text for cell_text in stripped_cells if cell_text]
                if row_text:
                    content.append(" | ".join(row_text))

        # Add text from headers and footers
        for section in doc.sections:
            for label, part in (("HEADER", section.header), ("FOOTER", section.footer)):
                if not part:
                    continue
                # A header/footer linked to the previous section exposes that
                # section's paragraphs again (or nothing at all for the first
                # section); emitting it would only duplicate text already
                # collected.
                if part.is_linked_to_previous:
                    continue
                for paragraph in part.paragraphs:
                    text = paragraph.text.strip()
                    if text:
                        content.append(f"{label}: {text}")

        return "\n\n".join(content)
    except Exception as e:
        # Callers rely on getting a message back rather than an exception.
        return f"Error extracting text: {str(e)}"
def main():
    """Command-line entry point: convert a .docx file to plain text.

    Usage: docx2txt <path_to_docx_file> [output_file]

    The input path is read from ``sys.argv[1]``. The extracted text is written
    to ``sys.argv[2]`` (UTF-8) when that argument is present, otherwise it is
    printed to stdout. Usage, warning and error messages go to stderr so that
    redirected stdout only ever contains extracted text.

    Exits with status 1 on bad usage, a missing input file, a failed
    extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: docx2txt <path_to_docx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    docx_path = sys.argv[1]
    if not Path(docx_path).exists():
        print(f"Error: File '{docx_path}' not found", file=sys.stderr)
        sys.exit(1)
    if not docx_path.lower().endswith('.docx'):
        print(f"Warning: File '{docx_path}' doesn't have a .docx extension", file=sys.stderr)

    text = extract_text_from_docx(docx_path)
    # extract_text_from_docx reports failure by returning a message with this
    # prefix instead of raising. Treat it as a failure rather than saving the
    # message as if it were document text.
    # NOTE: a document whose first paragraph literally starts with this prefix
    # would be misreported; accepted as vanishingly unlikely.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except (OSError, UnicodeError) as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import sys
import os
from pathlib import Path
try:
import PyPDF2
except ImportError:
try:
import pip
print("PyPDF2 package not installed. Attempting to install...")
from subprocess import check_call
check_call([sys.executable, '-m', 'pip', 'install', 'PyPDF2'])
import PyPDF2
print("PyPDF2 successfully installed.")
except Exception as e:
print(f"Error: Failed to install PyPDF2. {str(e)}")
print("Install it manually with: pip install PyPDF2")
sys.exit(1)
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file.

    The output starts with a "PDF contains N pages." line, followed by a
    "===== PAGE k =====" banner and the extracted text for every page. Pages
    that yield no text get a placeholder remark instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text. This function never raises: on any failure
        (unreadable file, parse error, ...) it returns the string
        "Error extracting text: <reason>" instead.
    """
    try:
        text_content = []
        # The reader pulls page data from the file lazily, so all extraction
        # has to happen while the file is still open.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            pages = pdf_reader.pages
            text_content.append(f"PDF contains {len(pages)} pages.\n")

            for page_number, page in enumerate(pages, start=1):
                text_content.append(f"\n\n===== PAGE {page_number} =====\n")
                # Defensive: treat a missing result like an empty one so a
                # single odd page cannot abort the whole document.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    text_content.append(page_text)
                else:
                    # If no text extracted, the page might be an image
                    text_content.append("(This page may contain images or scanned text that cannot be extracted)")

        return "\n".join(text_content)
    except Exception as e:
        # Callers rely on getting a message back rather than an exception.
        return f"Error extracting text: {str(e)}"
def main():
    """Command-line entry point: convert a PDF file to plain text.

    Usage: pdf2txt <path_to_pdf_file> [output_file]

    The input path is read from ``sys.argv[1]``. The extracted text is written
    to ``sys.argv[2]`` (UTF-8) when that argument is present, otherwise it is
    printed to stdout. Usage, warning and error messages go to stderr so that
    redirected stdout only ever contains extracted text.

    Exits with status 1 on bad usage, a missing input file, a failed
    extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: pdf2txt <path_to_pdf_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    pdf_path = sys.argv[1]
    if not Path(pdf_path).exists():
        print(f"Error: File '{pdf_path}' not found", file=sys.stderr)
        sys.exit(1)
    if not pdf_path.lower().endswith('.pdf'):
        print(f"Warning: File '{pdf_path}' doesn't have a .pdf extension", file=sys.stderr)

    text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf reports failure by returning a message with this
    # prefix instead of raising. Treat it as a failure rather than saving the
    # message as if it were document text.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except (OSError, UnicodeError) as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import sys
from pathlib import Path
try:
from pptx import Presentation
except ImportError:
print("Error: python-pptx package not installed.")
print("Install it with: pip install python-pptx")
sys.exit(1)
def extract_text_from_shape(shape, text_content):
    """Append all text found in *shape* to the *text_content* list.

    Handles three cases, in this order: the shape's own text, the members of
    a group shape (visited recursively), and the cells of a table. The list
    is modified in place; nothing is returned.
    """
    # Own text, if the shape exposes any
    own_text = getattr(shape, "text", "").strip()
    if own_text:
        text_content.append(own_text)

    # Group shapes carry their members in .shapes
    for member in getattr(shape, "shapes", ()):
        extract_text_from_shape(member, text_content)

    # Everything below only concerns table shapes
    if not getattr(shape, "has_table", False):
        return

    text_content.append("\nTABLE CONTENT:")
    for table_row in shape.table.rows:
        stripped = (cell.text.strip() for cell in table_row.cells)
        filled = [value for value in stripped if value]
        if filled:
            text_content.append(" | ".join(filled))
def extract_text_from_pptx(pptx_path):
    """Collect the text of every slide in a PowerPoint file (.pptx).

    Each slide contributes a "--- SLIDE k ---" banner followed by whatever
    extract_text_from_shape gathers from its shapes. Returns the lines joined
    with newlines, or "Error extracting text: <reason>" if anything fails.
    """
    try:
        presentation = Presentation(pptx_path)
        lines = []
        for number, slide in enumerate(presentation.slides, start=1):
            lines.append(f"\n\n--- SLIDE {number} ---\n")
            # Shape text goes straight after the slide banner
            for shape in slide.shapes:
                extract_text_from_shape(shape, lines)
        return "\n".join(lines)
    except Exception as exc:
        return f"Error extracting text: {exc}"
def main():
    """Command-line entry point: convert a .pptx file to plain text.

    Usage: pptx2txt <path_to_pptx_file> [output_file]

    The input path is read from ``sys.argv[1]``. The extracted text is written
    to ``sys.argv[2]`` (UTF-8) when that argument is present, otherwise it is
    printed to stdout. Usage, warning and error messages go to stderr so that
    redirected stdout only ever contains extracted text.

    Exits with status 1 on bad usage, a missing input file, a failed
    extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: pptx2txt <path_to_pptx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    pptx_path = sys.argv[1]
    if not Path(pptx_path).exists():
        print(f"Error: File '{pptx_path}' not found", file=sys.stderr)
        sys.exit(1)
    if not pptx_path.lower().endswith('.pptx'):
        print(f"Warning: File '{pptx_path}' doesn't have a .pptx extension", file=sys.stderr)

    text = extract_text_from_pptx(pptx_path)
    # extract_text_from_pptx reports failure by returning a message with this
    # prefix instead of raising. Treat it as a failure rather than saving the
    # message as if it were slide text.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except (OSError, UnicodeError) as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import sys
from pathlib import Path
try:
import openpyxl
except ImportError:
print("Error: openpyxl package not installed.")
print("Install it with: pip install openpyxl")
sys.exit(1)
def _format_cell_value(value):
    """Render one cell value as text; empty cells become an empty string."""
    if value is None:
        return ""
    if isinstance(value, float):
        # 15 significant digits drops binary float noise (0.1 + 0.2 -> "0.3")
        # without collapsing small magnitudes to "0" or truncating precision
        # the way a fixed number of decimals does.
        return f"{value:.15g}"
    return str(value)


def _named_range_lines(workbook):
    """Return the "Named Ranges:" lines for *workbook*, or [] if unavailable."""
    try:
        defined_names = getattr(workbook, "defined_names", None)
        if not defined_names:
            return []
        entries = []
        for name in defined_names:
            if name in defined_names:
                entries.append(f"{name}: {defined_names[name].value}")
    except Exception:
        # Best effort only: the defined-names container differs between
        # openpyxl releases, and a problem here must not discard the cell
        # data that was already extracted.
        return []
    return ["\nNamed Ranges:", *entries] if entries else []


def extract_text_from_xlsx(xlsx_path):
    """Extract text from an Excel spreadsheet (.xlsx).

    Every worksheet contributes a "==== SHEET: <title> ====" banner followed
    by one tab-separated line per non-empty row, or "(Empty Sheet)" when the
    sheet has no data. Formula cells yield their cached values
    (``data_only=True``). Chart sheets hold no cells and are skipped. Defined
    names, when readable, are listed once at the end under "Named Ranges:".

    Args:
        xlsx_path: Path of the .xlsx file to read.

    Returns:
        The extracted text. This function never raises: on any failure it
        returns the string "Error extracting text: <reason>" instead.
    """
    try:
        workbook = openpyxl.load_workbook(xlsx_path, data_only=True)
        content = []

        # Only real worksheets: chart sheets have no rows or columns to read.
        for sheet in workbook.worksheets:
            # Add sheet name as header
            content.append(f"\n\n==== SHEET: {sheet.title} ====\n")

            # Find the used range
            max_row = sheet.max_row
            max_col = sheet.max_column

            # Skip completely empty sheets
            if max_row == 1 and max_col == 1 and sheet.cell(1, 1).value is None:
                content.append("(Empty Sheet)")
                continue

            # Process cells
            for row in sheet.iter_rows(min_row=1, max_row=max_row,
                                       min_col=1, max_col=max_col):
                row_values = [_format_cell_value(cell.value) for cell in row]
                # Only add non-empty rows
                if any(val.strip() for val in row_values):
                    content.append("\t".join(row_values))

        content.extend(_named_range_lines(workbook))
        return "\n".join(content)
    except Exception as e:
        # Callers rely on getting a message back rather than an exception.
        return f"Error extracting text: {str(e)}"
def main():
    """Command-line entry point: convert an .xlsx file to plain text.

    Usage: xlsx2txt <path_to_xlsx_file> [output_file]

    The input path is read from ``sys.argv[1]``. The extracted text is written
    to ``sys.argv[2]`` (UTF-8) when that argument is present, otherwise it is
    printed to stdout. Usage, warning and error messages go to stderr so that
    redirected stdout only ever contains extracted text.

    Exits with status 1 on bad usage, a missing input file, a failed
    extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: xlsx2txt <path_to_xlsx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    xlsx_path = sys.argv[1]
    if not Path(xlsx_path).exists():
        print(f"Error: File '{xlsx_path}' not found", file=sys.stderr)
        sys.exit(1)
    if not xlsx_path.lower().endswith('.xlsx'):
        print(f"Warning: File '{xlsx_path}' doesn't have a .xlsx extension", file=sys.stderr)

    text = extract_text_from_xlsx(xlsx_path)
    # extract_text_from_xlsx reports failure by returning a message with this
    # prefix instead of raising. Treat it as a failure rather than saving the
    # message as if it were spreadsheet text.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except (OSError, UnicodeError) as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment