brokosz · April 5, 2025 00:07
diff --git a/docx2txt.py b/docx2txt.py
 #!/usr/bin/env python3

 import sys
 from pathlib import Path
 try:
    from docx import Document
 except ImportError:
    print("Error: python-docx package not installed.")
    print("Install it with: pip install python-docx")
    sys.exit(1)

 def extract_text_from_docx(docx_path):
    """Extract the readable text of a Word document (.docx).

    Collected in this order: non-blank body paragraphs, the rows of every
    table (non-empty cells joined with " | " under a "TABLE CONTENT:"
    line), and the header and footer paragraphs of each section, prefixed
    with "HEADER: " and "FOOTER: ".

    Args:
        docx_path: Location of the .docx file; handed to ``Document``.

    Returns:
        The collected pieces separated by blank lines.  Failures are not
        raised: the string "Error extracting text: <reason>" is returned
        instead.
    """
    try:
        doc = Document(docx_path)
        content = []

        # Body paragraphs, blank ones skipped.
        for paragraph in doc.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                content.append(stripped)

        # Tables: one output line per row that contains any text.
        for table in doc.tables:
            content.append("\nTABLE CONTENT:")
            for row in table.rows:
                stripped_cells = (cell.text.strip() for cell in row.cells)
                row_text = [cell_text for cell_text in stripped_cells if cell_text]
                if row_text:
                    content.append(" | ".join(row_text))

        # Headers and footers, section by section (header before footer).
        for section in doc.sections:
            for label, part in (("HEADER: ", section.header), ("FOOTER: ", section.footer)):
                # A header/footer linked to the previous section exposes that
                # section's paragraphs again; emitting them would only repeat
                # lines that were already collected.
                if not part or part.is_linked_to_previous:
                    continue
                for paragraph in part.paragraphs:
                    stripped = paragraph.text.strip()
                    if stripped:
                        content.append(label + stripped)

        return "\n\n".join(content)
    except Exception as e:
        # Callers expect an error string rather than an exception.
        return f"Error extracting text: {str(e)}"

 def main():
    """Command-line entry point: convert a .docx file to plain text.

    Usage: docx2txt <path_to_docx_file> [output_file]

    The extracted text is written to ``output_file`` (UTF-8) when one is
    given, otherwise printed to standard output.  Usage, warning and error
    messages go to standard error so they never mix with extracted text on
    stdout.  Exits with status 1 on a usage error, a missing input file, a
    failed extraction, or an output file that cannot be written.
    """
    if len(sys.argv) < 2:
        print("Usage: docx2txt <path_to_docx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    docx_path = sys.argv[1]

    if not Path(docx_path).exists():
        print(f"Error: File '{docx_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not docx_path.lower().endswith('.docx'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{docx_path}' doesn't have a .docx extension", file=sys.stderr)

    text = extract_text_from_docx(docx_path)

    # The extractor signals failure by returning a message with this prefix
    # instead of raising.  Treat it as an error so a failed run is neither
    # saved nor reported as a success.
    # NOTE(review): a document whose first paragraph starts with this exact
    # phrase would be misclassified; accepted as vanishingly unlikely.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Text successfully extracted to {output_path}")
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(text)

 if __name__ == "__main__":
    main()
diff --git a/pdf2txt.py b/pdf2txt.py
 #!/usr/bin/env python3

 import sys
 import os
 from pathlib import Path
 # Import PyPDF2; if it is missing, try to install it with the running
 # interpreter's pip and import it again. Any failure along the way ends
 # the script with exit status 1.
 try:
    import PyPDF2
 except ImportError:
    try:
        # NOTE(review): `pip` is not referenced again in the visible code;
        # presumably this import is only an availability probe - confirm.
        import pip
        print("PyPDF2 package not installed. Attempting to install...")
        from subprocess import check_call
        # sys.executable ensures the package lands in this interpreter's
        # environment; check_call raises if pip exits with a non-zero status.
        check_call([sys.executable, '-m', 'pip', 'install', 'PyPDF2'])
        import PyPDF2
        print("PyPDF2 successfully installed.")
    except Exception as e:
        # Covers a missing pip, a failed install and a failed re-import.
        print(f"Error: Failed to install PyPDF2. {str(e)}")
        print("Install it manually with: pip install PyPDF2")
        sys.exit(1)

 def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF file.

    The result opens with a "PDF contains N pages." line; each page follows
    under a "===== PAGE k =====" banner.  A page that yields only
    whitespace gets a placeholder note instead of its text.  Any failure is
    reported by returning "Error extracting text: <reason>" rather than by
    raising.
    """
    no_text_note = "(This page may contain images or scanned text that cannot be extracted)"
    try:
        pieces = []

        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            page_count = len(reader.pages)
            pieces.append(f"PDF contains {page_count} pages.\n")

            for index in range(page_count):
                current = reader.pages[index]
                pieces.append(f"\n\n===== PAGE {index + 1} =====\n")

                extracted = current.extract_text()
                # Whitespace-only output is treated as "nothing extractable".
                pieces.append(extracted if extracted.strip() else no_text_note)

        return "\n".join(pieces)
    except Exception as e:
        return f"Error extracting text: {str(e)}"

 def main():
    """Command-line entry point: convert a PDF file to plain text.

    Usage: pdf2txt <path_to_pdf_file> [output_file]

    The extracted text is written to ``output_file`` (UTF-8) when one is
    given, otherwise printed to standard output.  Usage, warning and error
    messages go to standard error so they never mix with extracted text on
    stdout.  Exits with status 1 on a usage error, a missing input file, a
    failed extraction, or an output file that cannot be written.
    """
    if len(sys.argv) < 2:
        print("Usage: pdf2txt <path_to_pdf_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    pdf_path = sys.argv[1]

    if not Path(pdf_path).exists():
        print(f"Error: File '{pdf_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not pdf_path.lower().endswith('.pdf'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{pdf_path}' doesn't have a .pdf extension", file=sys.stderr)

    text = extract_text_from_pdf(pdf_path)

    # The extractor signals failure by returning a message with this prefix
    # instead of raising.  Treat it as an error so a failed run is neither
    # saved nor reported as a success.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Text successfully extracted to {output_path}")
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(text)

 if __name__ == "__main__":
    main()
diff --git a/pptx2txt.py b/pptx2txt.py
 #!/usr/bin/env python3

 import sys
 from pathlib import Path
 try:
    from pptx import Presentation
 except ImportError:
    print("Error: python-pptx package not installed.")
    print("Install it with: pip install python-pptx")
    sys.exit(1)

 def extract_text_from_shape(shape, text_content):
    """Append the text carried by ``shape`` to ``text_content`` in place.

    Three sources are collected, in this order: the shape's own ``text``,
    the shapes nested inside it when it exposes ``shapes`` (handled
    recursively), and the rows of its table when ``has_table`` is true.
    Attributes the shape does not have are skipped.  Returns None.
    """
    # 1. The shape's own text, if it has any.
    own_text = getattr(shape, "text", "").strip()
    if own_text:
        text_content.append(own_text)

    # 2. Children of a group-like shape; a shape without ``shapes`` has none.
    for child in getattr(shape, "shapes", ()):
        extract_text_from_shape(child, text_content)

    # 3. Table rows, one " | "-joined line per row that has any text.
    if not getattr(shape, "has_table", False):
        return

    text_content.append("\nTABLE CONTENT:")
    for row in shape.table.rows:
        stripped_cells = [cell.text.strip() for cell in row.cells]
        filled = [cell_text for cell_text in stripped_cells if cell_text]
        if filled:
            text_content.append(" | ".join(filled))

 def extract_text_from_pptx(pptx_path):
    """Return the text of every slide in a PowerPoint file (.pptx).

    Each slide is introduced by a "--- SLIDE k ---" banner (k counts from
    1) followed by whatever ``extract_text_from_shape`` collects from the
    slide's shapes.  All pieces are joined with newlines.  Any failure is
    reported by returning "Error extracting text: <reason>" rather than by
    raising.
    """
    try:
        presentation = Presentation(pptx_path)
        lines = []

        for number, slide in enumerate(presentation.slides, start=1):
            lines.append(f"\n\n--- SLIDE {number} ---\n")
            # Shapes append straight into the shared list, right after the banner.
            for shape in slide.shapes:
                extract_text_from_shape(shape, lines)

        return "\n".join(lines)
    except Exception as e:
        return f"Error extracting text: {str(e)}"

 def main():
    """Command-line entry point: convert a .pptx file to plain text.

    Usage: pptx2txt <path_to_pptx_file> [output_file]

    The extracted text is written to ``output_file`` (UTF-8) when one is
    given, otherwise printed to standard output.  Usage, warning and error
    messages go to standard error so they never mix with extracted text on
    stdout.  Exits with status 1 on a usage error, a missing input file, a
    failed extraction, or an output file that cannot be written.
    """
    if len(sys.argv) < 2:
        print("Usage: pptx2txt <path_to_pptx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    pptx_path = sys.argv[1]

    if not Path(pptx_path).exists():
        print(f"Error: File '{pptx_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not pptx_path.lower().endswith('.pptx'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{pptx_path}' doesn't have a .pptx extension", file=sys.stderr)

    text = extract_text_from_pptx(pptx_path)

    # The extractor signals failure by returning a message with this prefix
    # instead of raising.  Treat it as an error so a failed run is neither
    # saved nor reported as a success.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Text successfully extracted to {output_path}")
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(text)

 if __name__ == "__main__":
    main()
diff --git a/xlsx2txt.py b/xlsx2txt.py
 #!/usr/bin/env python3

 import sys
 from pathlib import Path
 try:
    import openpyxl
 except ImportError:
    print("Error: openpyxl package not installed.")
    print("Install it with: pip install openpyxl")
    sys.exit(1)

 def extract_text_from_xlsx(xlsx_path):
    """Extract the cell contents of an Excel workbook (.xlsx) as text.

    Every worksheet is emitted under a "==== SHEET: <title> ====" banner
    with one tab-separated line per non-empty row.  Chart sheets carry no
    cell grid and are skipped.  Named ranges belong to the workbook, so
    they are listed once, after the last sheet.

    Args:
        xlsx_path: Location of the workbook; handed to
            ``openpyxl.load_workbook``.

    Returns:
        The extracted text joined with newlines.  Failures are not raised:
        the string "Error extracting text: <reason>" is returned instead.
    """
    try:
        workbook = openpyxl.load_workbook(xlsx_path, data_only=True)
        content = []

        # ``worksheets`` leaves out chart sheets, which have no max_row /
        # max_column and would otherwise abort the whole extraction.
        for sheet in workbook.worksheets:
            content.append(f"\n\n==== SHEET: {sheet.title} ====\n")

            max_row = sheet.max_row
            max_col = sheet.max_column

            # A 1x1 used range whose only cell is empty means a blank sheet.
            if max_row == 1 and max_col == 1 and sheet.cell(1, 1).value is None:
                content.append("(Empty Sheet)")
                continue

            for row in sheet.iter_rows(min_row=1, max_row=max_row, max_col=max_col):
                row_values = []
                for cell in row:
                    value = cell.value
                    if value is None:
                        value = ""
                    elif isinstance(value, float):
                        # Fixed six decimals, then drop trailing zeros and a
                        # dangling decimal point (e.g. 2.500000 -> 2.5).
                        value = f"{value:.6f}".rstrip('0').rstrip('.')
                    else:
                        value = str(value)
                    row_values.append(value)

                # Only add non-empty rows.
                if any(val.strip() for val in row_values):
                    content.append("\t".join(row_values))

        # Named ranges are best effort: the defined_names container differs
        # between openpyxl versions, so any error here just omits the list.
        try:
            defined_names = getattr(workbook, 'defined_names', None)
            if defined_names:
                range_lines = []
                for name in defined_names:
                    defined_name = defined_names[name]
                    if defined_name.destinations:
                        range_lines.append(f"{name}: {defined_name.value}")

                if range_lines:
                    content.append("\nNamed Ranges:")
                    content.extend(range_lines)
        except Exception:
            pass

        return "\n".join(content)
    except Exception as e:
        # Callers expect an error string rather than an exception.
        return f"Error extracting text: {str(e)}"

 def main():
    """Command-line entry point: convert an .xlsx file to plain text.

    Usage: xlsx2txt <path_to_xlsx_file> [output_file]

    The extracted text is written to ``output_file`` (UTF-8) when one is
    given, otherwise printed to standard output.  Usage, warning and error
    messages go to standard error so they never mix with extracted text on
    stdout.  Exits with status 1 on a usage error, a missing input file, a
    failed extraction, or an output file that cannot be written.
    """
    if len(sys.argv) < 2:
        print("Usage: xlsx2txt <path_to_xlsx_file> [output_file]", file=sys.stderr)
        print("If output_file is not provided, will print to console", file=sys.stderr)
        sys.exit(1)

    xlsx_path = sys.argv[1]

    if not Path(xlsx_path).exists():
        print(f"Error: File '{xlsx_path}' not found", file=sys.stderr)
        sys.exit(1)

    if not xlsx_path.lower().endswith('.xlsx'):
        # Only a warning: extraction is still attempted.
        print(f"Warning: File '{xlsx_path}' doesn't have a .xlsx extension", file=sys.stderr)

    text = extract_text_from_xlsx(xlsx_path)

    # The extractor signals failure by returning a message with this prefix
    # instead of raising.  Treat it as an error so a failed run is neither
    # saved nor reported as a success.
    if text.startswith("Error extracting text: "):
        print(text, file=sys.stderr)
        sys.exit(1)

    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
        try:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Text successfully extracted to {output_path}")
        except Exception as e:
            print(f"Error writing to output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(text)

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	import sys
	from pathlib import Path
	try:
	from docx import Document
	except ImportError:
	print("Error: python-docx package not installed.")
	print("Install it with: pip install python-docx")
	sys.exit(1)

	def extract_text_from_docx(docx_path):
	    """Extract the readable text of a Word document (.docx).

	    Collected in this order: non-blank body paragraphs, the rows of every
	    table (non-empty cells joined with " | " under a "TABLE CONTENT:"
	    line), and the header and footer paragraphs of each section, prefixed
	    with "HEADER: " and "FOOTER: ".

	    Args:
	        docx_path: Location of the .docx file; handed to ``Document``.

	    Returns:
	        The collected pieces separated by blank lines.  Failures are not
	        raised: the string "Error extracting text: <reason>" is returned
	        instead.
	    """
	    try:
	        doc = Document(docx_path)
	        content = []

	        # Body paragraphs, blank ones skipped.
	        for paragraph in doc.paragraphs:
	            stripped = paragraph.text.strip()
	            if stripped:
	                content.append(stripped)

	        # Tables: one output line per row that contains any text.
	        for table in doc.tables:
	            content.append("\nTABLE CONTENT:")
	            for row in table.rows:
	                stripped_cells = (cell.text.strip() for cell in row.cells)
	                row_text = [cell_text for cell_text in stripped_cells if cell_text]
	                if row_text:
	                    # Plain pipe separator (no backslash escape).
	                    content.append(" | ".join(row_text))

	        # Headers and footers, section by section (header before footer).
	        for section in doc.sections:
	            for label, part in (("HEADER: ", section.header), ("FOOTER: ", section.footer)):
	                # A header/footer linked to the previous section exposes that
	                # section's paragraphs again; emitting them would only repeat
	                # lines that were already collected.
	                if not part or part.is_linked_to_previous:
	                    continue
	                for paragraph in part.paragraphs:
	                    stripped = paragraph.text.strip()
	                    if stripped:
	                        content.append(label + stripped)

	        return "\n\n".join(content)
	    except Exception as e:
	        # Callers expect an error string rather than an exception.
	        return f"Error extracting text: {str(e)}"

	def main():
	    """Command-line entry point: convert a .docx file to plain text.

	    Usage: docx2txt <path_to_docx_file> [output_file]

	    The extracted text is written to ``output_file`` (UTF-8) when one is
	    given, otherwise printed to standard output.  Usage, warning and error
	    messages go to standard error so they never mix with extracted text on
	    stdout.  Exits with status 1 on a usage error, a missing input file, a
	    failed extraction, or an output file that cannot be written.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: docx2txt <path_to_docx_file> [output_file]", file=sys.stderr)
	        print("If output_file is not provided, will print to console", file=sys.stderr)
	        sys.exit(1)

	    docx_path = sys.argv[1]

	    if not Path(docx_path).exists():
	        print(f"Error: File '{docx_path}' not found", file=sys.stderr)
	        sys.exit(1)

	    if not docx_path.lower().endswith('.docx'):
	        # Only a warning: extraction is still attempted.
	        print(f"Warning: File '{docx_path}' doesn't have a .docx extension", file=sys.stderr)

	    text = extract_text_from_docx(docx_path)

	    # The extractor signals failure by returning a message with this prefix
	    # instead of raising.  Treat it as an error so a failed run is neither
	    # saved nor reported as a success.
	    # NOTE(review): a document whose first paragraph starts with this exact
	    # phrase would be misclassified; accepted as vanishingly unlikely.
	    if text.startswith("Error extracting text: "):
	        print(text, file=sys.stderr)
	        sys.exit(1)

	    if len(sys.argv) >= 3:
	        output_path = sys.argv[2]
	        try:
	            with open(output_path, "w", encoding="utf-8") as f:
	                f.write(text)
	            print(f"Text successfully extracted to {output_path}")
	        except Exception as e:
	            print(f"Error writing to output file: {e}", file=sys.stderr)
	            sys.exit(1)
	    else:
	        print(text)

	if __name__ == "__main__":
	main()
	#!/usr/bin/env python3

	import sys
	import os
	from pathlib import Path
	try:
	import PyPDF2
	except ImportError:
	try:
	import pip
	print("PyPDF2 package not installed. Attempting to install...")
	from subprocess import check_call
	check_call([sys.executable, '-m', 'pip', 'install', 'PyPDF2'])
	import PyPDF2
	print("PyPDF2 successfully installed.")
	except Exception as e:
	print(f"Error: Failed to install PyPDF2. {str(e)}")
	print("Install it manually with: pip install PyPDF2")
	sys.exit(1)

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF file.

	    The result opens with a "PDF contains N pages." line; each page follows
	    under a "===== PAGE k =====" banner.  A page that yields only
	    whitespace gets a placeholder note instead of its text.

	    Args:
	        pdf_path: Location of the PDF file; opened in binary mode.

	    Returns:
	        The pieces joined with newlines.  Failures are not raised: the
	        string "Error extracting text: <reason>" is returned instead.
	    """
	    no_text_note = "(This page may contain images or scanned text that cannot be extracted)"
	    try:
	        text_content = []

	        # The reader needs the file open for as long as pages are read, so
	        # all page access stays inside the ``with`` block.
	        with open(pdf_path, 'rb') as file:
	            pdf_reader = PyPDF2.PdfReader(file)

	            num_pages = len(pdf_reader.pages)
	            text_content.append(f"PDF contains {num_pages} pages.\n")

	            for page_num in range(num_pages):
	                page = pdf_reader.pages[page_num]
	                text_content.append(f"\n\n===== PAGE {page_num + 1} =====\n")

	                page_text = page.extract_text()
	                # Whitespace-only output is treated as "nothing extractable".
	                text_content.append(page_text if page_text.strip() else no_text_note)

	        return "\n".join(text_content)
	    except Exception as e:
	        # Callers expect an error string rather than an exception.
	        return f"Error extracting text: {str(e)}"

	def main():
	    """Command-line entry point: convert a PDF file to plain text.

	    Usage: pdf2txt <path_to_pdf_file> [output_file]

	    The extracted text is written to ``output_file`` (UTF-8) when one is
	    given, otherwise printed to standard output.  Usage, warning and error
	    messages go to standard error so they never mix with extracted text on
	    stdout.  Exits with status 1 on a usage error, a missing input file, a
	    failed extraction, or an output file that cannot be written.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: pdf2txt <path_to_pdf_file> [output_file]", file=sys.stderr)
	        print("If output_file is not provided, will print to console", file=sys.stderr)
	        sys.exit(1)

	    pdf_path = sys.argv[1]

	    if not Path(pdf_path).exists():
	        print(f"Error: File '{pdf_path}' not found", file=sys.stderr)
	        sys.exit(1)

	    if not pdf_path.lower().endswith('.pdf'):
	        # Only a warning: extraction is still attempted.
	        print(f"Warning: File '{pdf_path}' doesn't have a .pdf extension", file=sys.stderr)

	    text = extract_text_from_pdf(pdf_path)

	    # The extractor signals failure by returning a message with this prefix
	    # instead of raising.  Treat it as an error so a failed run is neither
	    # saved nor reported as a success.
	    if text.startswith("Error extracting text: "):
	        print(text, file=sys.stderr)
	        sys.exit(1)

	    if len(sys.argv) >= 3:
	        output_path = sys.argv[2]
	        try:
	            with open(output_path, "w", encoding="utf-8") as f:
	                f.write(text)
	            print(f"Text successfully extracted to {output_path}")
	        except Exception as e:
	            print(f"Error writing to output file: {e}", file=sys.stderr)
	            sys.exit(1)
	    else:
	        print(text)

	if __name__ == "__main__":
	main()