Last active
April 5, 2025 00:07
-
-
Save brokosz/7dc98479659a838cfdbfa6aa50b2ea9c to your computer and use it in GitHub Desktop.
Some quick scripts to strip plain text out of common file types to support LLM data collections.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| from pathlib import Path | |
| try: | |
| from docx import Document | |
| except ImportError: | |
| print("Error: python-docx package not installed.") | |
| print("Install it with: pip install python-docx") | |
| sys.exit(1) | |
def extract_text_from_docx(docx_path):
    """Extract text from a Word document (.docx).

    The result contains, in this order: every non-empty body paragraph,
    the rows of every table (preceded by a ``TABLE CONTENT:`` marker, cells
    joined with `` | ``), and the header and footer paragraphs of each
    section (prefixed with ``HEADER: `` / ``FOOTER: ``). Entries are
    separated by a blank line.

    Args:
        docx_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        The extracted text. On any failure the function does not raise; it
        returns the string ``"Error extracting text: <reason>"`` instead.
    """
    try:
        doc = Document(docx_path)

        # Body paragraphs, skipping the ones that are empty after stripping.
        content = [
            text
            for text in (paragraph.text.strip() for paragraph in doc.paragraphs)
            if text
        ]

        # Tables: one output line per row that has at least one non-empty cell.
        for table in doc.tables:
            content.append("\nTABLE CONTENT:")
            for row in table.rows:
                row_text = [
                    text
                    for text in (cell.text.strip() for cell in row.cells)
                    if text
                ]
                if row_text:
                    content.append(" | ".join(row_text))

        # Headers and footers, section by section (header first, then footer).
        for section in doc.sections:
            for label, part in (("HEADER", section.header), ("FOOTER", section.footer)):
                if not part:
                    continue
                # A header/footer that is linked to the previous section has
                # no content of its own and just repeats the inherited one
                # (per the python-docx header/footer docs); emitting it again
                # would duplicate the text once per section.
                if getattr(part, "is_linked_to_previous", False):
                    continue
                for paragraph in part.paragraphs:
                    text = paragraph.text.strip()
                    if text:
                        content.append(f"{label}: {text}")

        return "\n\n".join(content)
    except Exception as e:
        # Callers rely on getting a string back even when extraction fails.
        return f"Error extracting text: {e}"
def main():
    """Command-line entry point: convert a .docx file to plain text.

    Usage: ``docx2txt <path_to_docx_file> [output_file]``.

    With an output file the extracted text is written there as UTF-8;
    otherwise it is printed to stdout. Usage, warning and error messages go
    to stderr so they never mix with extracted text that is piped or
    redirected. The process exits with status 1 on a usage error, a missing
    input file, a failed extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: docx2txt <path_to_docx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    docx_path = sys.argv[1]
    if not Path(docx_path).exists():
        print(f"Error: File '{docx_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not docx_path.lower().endswith('.docx'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{docx_path}' doesn't have a .docx extension", file=sys.stderr)

    text = extract_text_from_docx(docx_path)

    # extract_text_from_docx reports failure by returning a message with this
    # prefix instead of raising. Without this check the error message would
    # be saved as if it were the document text and reported as a success.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| import os | |
| from pathlib import Path | |
| try: | |
| import PyPDF2 | |
| except ImportError: | |
| try: | |
| import pip | |
| print("PyPDF2 package not installed. Attempting to install...") | |
| from subprocess import check_call | |
| check_call([sys.executable, '-m', 'pip', 'install', 'PyPDF2']) | |
| import PyPDF2 | |
| print("PyPDF2 successfully installed.") | |
| except Exception as e: | |
| print(f"Error: Failed to install PyPDF2. {str(e)}") | |
| print("Install it manually with: pip install PyPDF2") | |
| sys.exit(1) | |
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file.

    The result starts with a ``PDF contains N pages.`` line, followed by a
    ``===== PAGE k =====`` banner and the extracted text for every page.
    Pages that yield no text get a placeholder note instead.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text. On any failure (missing file, unreadable or
        undecryptable PDF, ...) the function does not raise; it returns the
        string ``"Error extracting text: <reason>"`` instead.
    """
    try:
        # The reader pulls page data from the file on demand, so all page
        # access has to happen while the file is still open.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Many "protected" PDFs are encrypted with an empty user
            # password. Try that before giving up; if it is not the right
            # password, the page access below raises and is reported through
            # the normal error path.
            if pdf_reader.is_encrypted:
                pdf_reader.decrypt("")

            pages = pdf_reader.pages
            text_content = [f"PDF contains {len(pages)} pages.\n"]

            for page_number, page in enumerate(pages, start=1):
                text_content.append(f"\n\n===== PAGE {page_number} =====\n")
                # Defensive: treat a missing result the same as empty text.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    text_content.append(page_text)
                else:
                    # If no text extracted, the page might be an image
                    text_content.append("(This page may contain images or scanned text that cannot be extracted)")

        return "\n".join(text_content)
    except Exception as e:
        # Callers rely on getting a string back even when extraction fails.
        return f"Error extracting text: {e}"
def main():
    """Command-line entry point: convert a PDF file to plain text.

    Usage: ``pdf2txt <path_to_pdf_file> [output_file]``.

    With an output file the extracted text is written there as UTF-8;
    otherwise it is printed to stdout. Usage, warning and error messages go
    to stderr so they never mix with extracted text that is piped or
    redirected. The process exits with status 1 on a usage error, a missing
    input file, a failed extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: pdf2txt <path_to_pdf_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    pdf_path = sys.argv[1]
    if not Path(pdf_path).exists():
        print(f"Error: File '{pdf_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not pdf_path.lower().endswith('.pdf'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{pdf_path}' doesn't have a .pdf extension", file=sys.stderr)

    text = extract_text_from_pdf(pdf_path)

    # extract_text_from_pdf reports failure by returning a message with this
    # prefix instead of raising (successful output always begins with the
    # "PDF contains" line). Without this check the error message would be
    # saved as if it were the document text and reported as a success.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| from pathlib import Path | |
| try: | |
| from pptx import Presentation | |
| except ImportError: | |
| print("Error: python-pptx package not installed.") | |
| print("Install it with: pip install python-pptx") | |
| sys.exit(1) | |
def extract_text_from_shape(shape, text_content):
    """Append the text found in *shape* to *text_content*, in place.

    Handles three duck-typed cases, checked in this order:

    * the shape exposes ``text``: its stripped text is appended when it is
      not empty;
    * the shape exposes ``shapes`` (a group): every member is processed
      recursively;
    * the shape reports ``has_table``: a ``TABLE CONTENT:`` marker is
      appended, followed by one `` | ``-joined line per row that has at
      least one non-empty cell.

    Nothing is returned; results accumulate in the ``text_content`` list.
    """
    # Plain text carried directly by the shape.
    if hasattr(shape, "text"):
        own_text = shape.text.strip()
        if own_text:
            text_content.append(own_text)

    # Group shapes: descend into each member.
    for member in getattr(shape, "shapes", ()):
        extract_text_from_shape(member, text_content)

    # Tables: nothing more to do unless the shape reports one.
    if not getattr(shape, "has_table", False):
        return

    text_content.append("\nTABLE CONTENT:")
    for row in shape.table.rows:
        stripped_cells = (cell.text.strip() for cell in row.cells)
        filled_cells = [cell_text for cell_text in stripped_cells if cell_text]
        if filled_cells:
            text_content.append(" | ".join(filled_cells))
def extract_text_from_pptx(pptx_path):
    """Extract text from a PowerPoint presentation (.pptx).

    Each slide contributes a ``--- SLIDE k ---`` banner followed by the text
    gathered from its shapes by ``extract_text_from_shape``; all pieces are
    joined with newlines.

    Returns the extracted text, or ``"Error extracting text: <reason>"`` when
    anything goes wrong (the function does not raise).
    """
    try:
        presentation = Presentation(pptx_path)
        collected = []
        for slide_number, slide in enumerate(presentation.slides, start=1):
            collected.append(f"\n\n--- SLIDE {slide_number} ---\n")
            # Shape text is appended right after the slide's banner.
            for shape in slide.shapes:
                extract_text_from_shape(shape, collected)
        return "\n".join(collected)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
def main():
    """Command-line entry point: convert a .pptx file to plain text.

    Usage: ``pptx2txt <path_to_pptx_file> [output_file]``.

    With an output file the extracted text is written there as UTF-8;
    otherwise it is printed to stdout. Usage, warning and error messages go
    to stderr so they never mix with extracted text that is piped or
    redirected. The process exits with status 1 on a usage error, a missing
    input file, a failed extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: pptx2txt <path_to_pptx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    pptx_path = sys.argv[1]
    if not Path(pptx_path).exists():
        print(f"Error: File '{pptx_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not pptx_path.lower().endswith('.pptx'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{pptx_path}' doesn't have a .pptx extension", file=sys.stderr)

    text = extract_text_from_pptx(pptx_path)

    # extract_text_from_pptx reports failure by returning a message with this
    # prefix instead of raising (successful output is either empty or begins
    # with a slide banner). Without this check the error message would be
    # saved as if it were the slide text and reported as a success.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| from pathlib import Path | |
| try: | |
| import openpyxl | |
| except ImportError: | |
| print("Error: openpyxl package not installed.") | |
| print("Install it with: pip install openpyxl") | |
| sys.exit(1) | |
def _format_cell_value(value):
    """Render one cell value as text for the tab-separated output."""
    if value is None:
        return ""
    if isinstance(value, float):
        # Format float to avoid excessive decimals
        text = f"{value:.6f}".rstrip('0').rstrip('.')
        # Six fixed decimals would turn a small non-zero value such as 1e-7
        # into "0"; fall back to general format so the value is not lost.
        if value and text in ("0", "-0"):
            text = f"{value:.6g}"
        return text
    # Integers, booleans, strings, dates, ... use their plain string form.
    return str(value)


def extract_text_from_xlsx(xlsx_path):
    """Extract text from an Excel spreadsheet (.xlsx).

    For every worksheet the result contains a ``==== SHEET: <name> ====``
    banner followed by one tab-separated line per non-empty row (or
    ``(Empty Sheet)``). The workbook is loaded with ``data_only=True``. If
    the workbook has defined names, a ``Named Ranges:`` section listing
    ``name: value`` pairs is appended on a best-effort basis.

    Args:
        xlsx_path: Path of the workbook, passed to ``openpyxl.load_workbook``.

    Returns:
        The extracted text. On any failure the function does not raise; it
        returns the string ``"Error extracting text: <reason>"`` instead.
    """
    try:
        workbook = openpyxl.load_workbook(xlsx_path, data_only=True)
        content = []

        # Iterate the worksheets rather than all sheet names: chart sheets
        # have no cell grid (no max_row / max_column), and a single one would
        # otherwise make the whole extraction fail.
        for sheet in workbook.worksheets:
            # Add sheet name as header
            content.append(f"\n\n==== SHEET: {sheet.title} ====\n")

            # Find the used range
            max_row = sheet.max_row
            max_col = sheet.max_column

            # Skip completely empty sheets
            if max_row == 1 and max_col == 1 and sheet.cell(1, 1).value is None:
                content.append("(Empty Sheet)")
                continue

            # Walk the used range row by row.
            for row in sheet.iter_rows(min_row=1, max_row=max_row,
                                       min_col=1, max_col=max_col):
                row_values = [_format_cell_value(cell.value) for cell in row]
                # Only add non-empty rows
                if any(val.strip() for val in row_values):
                    content.append("\t".join(row_values))

        # Named ranges are optional extras and their API differs between
        # openpyxl versions, so any problem here is deliberately ignored
        # rather than failing the whole extraction.
        try:
            defined_names = getattr(workbook, 'defined_names', None)
            if defined_names:
                named_ranges = []
                for name in defined_names:
                    if name in defined_names:
                        defined_name = defined_names[name]
                        # The original guarded this with
                        # "if defined_name.destinations:", which tests a
                        # generator object and is therefore always true; the
                        # check is dropped without changing the output.
                        named_ranges.append(f"{name}: {defined_name.value}")
                if named_ranges:
                    content.append("\nNamed Ranges:")
                    content.extend(named_ranges)
        except Exception:
            # Skip named ranges if there's any error
            pass

        return "\n".join(content)
    except Exception as e:
        # Callers rely on getting a string back even when extraction fails.
        return f"Error extracting text: {e}"
def main():
    """Command-line entry point: convert an .xlsx file to plain text.

    Usage: ``xlsx2txt <path_to_xlsx_file> [output_file]``.

    With an output file the extracted text is written there as UTF-8;
    otherwise it is printed to stdout. Usage, warning and error messages go
    to stderr so they never mix with extracted text that is piped or
    redirected. The process exits with status 1 on a usage error, a missing
    input file, a failed extraction, or a failed write.
    """
    if len(sys.argv) < 2:
        print("Usage: xlsx2txt <path_to_xlsx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    xlsx_path = sys.argv[1]
    if not Path(xlsx_path).exists():
        print(f"Error: File '{xlsx_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not xlsx_path.lower().endswith('.xlsx'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{xlsx_path}' doesn't have a .xlsx extension", file=sys.stderr)

    text = extract_text_from_xlsx(xlsx_path)

    # extract_text_from_xlsx reports failure by returning a message with this
    # prefix instead of raising (successful output begins with a sheet
    # banner). Without this check the error message would be saved as if it
    # were the spreadsheet text and reported as a success.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Text successfully extracted to {output_path}")
    else:
        print(text)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment