mshafiee · May 30, 2025 08:40
diff --git a/collect_text_files_content.py b/collect_text_files_content.py
 import os
 import sys
 from pathlib import Path
 import mimetypes

 # Exclusion rules for the directory scan. Sets keep membership tests cheap.

 # Directory names: a path is skipped when any of its components matches.
 EXCLUDED_DIRS = {
     ".git",          # Git version control
     ".vscode",       # Visual Studio Code IDE
     ".idea",         # JetBrains IDEs (IntelliJ, PyCharm, etc.)
     ".settings",     # Eclipse settings directory
     "node_modules",  # NodeJS dependencies
     "venv",          # Python virtual environments
     "__pycache__",   # Python bytecode cache
     "build",         # Common build output directory
     "dist",          # Common distribution output directory
     ".svn",          # Subversion version control
     "target",        # Common build output (Java/Rust)
     "out",           # Common build output
     "bin",           # Common compiled binaries directory
     "obj",           # Common compiled objects directory
     ".gradle",       # Gradle build system
     ".mvn",          # Maven build system
 }

 # Exact file names. These are compared literally against Path.name, so glob
 # patterns such as "*.lock" do not work here; use EXCLUDED_SUFFIXES for those.
 EXCLUDED_FILES = {
     ".env",                # Environment variables
     "code.md",             # The output file itself
     Path(__file__).name,   # The script file itself
     ".DS_Store",           # macOS specific file
     "Thumbs.db",           # Windows specific file
     ".gitignore",          # Git ignore file
     ".gitattributes",      # Git attributes file
     ".project",            # Eclipse project file
     ".classpath",          # Eclipse classpath file
     ".settings",           # Eclipse settings, in case it exists as a plain file
     "package-lock.json",   # npm lock file; its suffix is ".json", so the ".lock" suffix rule misses it
 }

 # Lower-case file suffixes (including the leading dot).
 EXCLUDED_SUFFIXES = {
     ".iml",   # JetBrains module file
     ".log",   # Log files
     ".lock",  # Lock files (poetry.lock, yarn.lock, Cargo.lock, ...)
     ".swp",   # Vim swap files
     ".swo",   # Vim swap files
     # Common binary/media types; listing them avoids a pointless read attempt.
     ".exe", ".dll", ".so", ".a", ".lib", ".o",
     ".jar", ".war", ".ear",
     ".zip", ".tar", ".gz", ".rar", ".7z",
     ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
     ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
     ".mp3", ".wav", ".mp4", ".mov", ".avi",
 }


 # Lower-case file suffix (with leading dot) -> markdown code-fence language tag.
 # Built once at import time rather than on every lookup.
 _EXTENSION_TO_LANGUAGE = {
     # Python
     '.py': 'python', '.pyw': 'python', '.pyi': 'python',

     # JavaScript
     '.js': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',

     # TypeScript
     '.ts': 'typescript', '.tsx': 'typescript',

     # Java
     '.java': 'java', '.jsp': 'java',

     # C
     '.c': 'c', '.h': 'c',

     # C++
     '.cpp': 'cpp', '.cxx': 'cpp', '.cc': 'cpp', '.hpp': 'cpp', '.hxx': 'cpp', '.hh': 'cpp',

     # C#
     '.cs': 'csharp', '.csx': 'csharp',

     # Go
     '.go': 'go',

     # Ruby
     '.rb': 'ruby', '.erb': 'ruby', '.rake': 'ruby',

     # PHP
     '.php': 'php', '.phtml': 'php',

     # HTML
     '.html': 'html', '.htm': 'html',

     # CSS
     '.css': 'css', '.scss': 'css', '.sass': 'css',

     # JSON
     '.json': 'json', '.geojson': 'json', '.jsonld': 'json',

     # XML
     '.xml': 'xml', '.xsl': 'xml', '.xsd': 'xml', '.rss': 'xml',

     # Shell/Bash
     '.sh': 'bash', '.bash': 'bash', '.zsh': 'bash', '.ksh': 'bash', '.bats': 'bash',

     # Markdown
     '.md': 'markdown', '.markdown': 'markdown',

     # YAML
     '.yml': 'yaml', '.yaml': 'yaml',

     # Rust
     '.rs': 'rust', '.rlib': 'rust',

     # Swift
     '.swift': 'swift',

     # Kotlin
     '.kt': 'kotlin', '.kts': 'kotlin',

     # Dockerfile given as a suffix (e.g. "api.dockerfile")
     '.dockerfile': 'dockerfile',

     # SQL
     '.sql': 'sql',

     # Properties
     '.properties': 'properties',

     # TOML
     '.toml': 'toml',

     # PowerShell
     '.ps1': 'powershell',

     # Groovy
     '.groovy': 'groovy', '.gradle': 'groovy',
 }


 def determine_code_block_language(file_path: Path) -> str:
     """
     Attempt to guess the programming language based on file name and extension.

     The lookup is case-insensitive. A file named exactly "Dockerfile" is
     always reported as 'dockerfile'. Otherwise the suffix decides; when the
     suffix is unknown and the name starts with "Dockerfile." (for example
     "Dockerfile.dev"), 'dockerfile' is used as a fallback.

     Args:
         file_path (Path): The path to the file. Only its name is inspected;
             the file does not need to exist.

     Returns:
         str: A language identifier for markdown code blocks (e.g., 'python', 'go'),
             or an empty string when the language cannot be determined.
     """
     lowered_name = file_path.name.lower()

     # Dockerfiles usually have no extension, so match on the whole name first.
     if lowered_name == 'dockerfile':
         return 'dockerfile'

     language = _EXTENSION_TO_LANGUAGE.get(file_path.suffix.lower(), '')

     # Variants such as "Dockerfile.prod": only used when the suffix itself is
     # not a known language, so "Dockerfile.md" is still markdown.
     if not language and lowered_name.startswith('dockerfile.'):
         return 'dockerfile'

     return language

 def is_excluded(path: Path, root: "Path | None" = None) -> bool:
     """
     Checks if a path should be excluded based on defined rules.

     A path is excluded when one of its components is listed in EXCLUDED_DIRS,
     or when it is an existing file whose name is in EXCLUDED_FILES or whose
     lower-cased suffix is in EXCLUDED_SUFFIXES.

     Args:
         path (Path): The path to check (can be file or directory).
         root (Path | None): Optional scan root. When given and `path` lies
             under it, only the components below `root` are compared against
             EXCLUDED_DIRS, so a scan root that itself sits inside a folder
             named e.g. "build" or "bin" is not rejected wholesale. When
             omitted, every component of `path` is checked.

     Returns:
         bool: True if the path should be excluded, False otherwise.
     """
     components = path.parts
     if root is not None:
         try:
             components = path.relative_to(root).parts
         except ValueError:
             # `path` is not located under `root`; keep checking the full path.
             pass

     if any(part in EXCLUDED_DIRS for part in components):
         return True

     # Name/suffix rules only apply to existing files (is_file() hits the filesystem).
     if path.is_file():
         return path.name in EXCLUDED_FILES or path.suffix.lower() in EXCLUDED_SUFFIXES

     return False


 def collect_text_files_content(directory: Path) -> str:
     """
     Collects the content of non-binary, UTF-8 encoded text files in the given directory,
     excluding common IDE, git, and environment files/folders.

     Directories and files are visited in sorted order so the result is
     reproducible. Exclusion rules are applied to the part of each path below
     `directory` only. Files that cannot be read or decoded as UTF-8 are
     skipped silently.

     Each file becomes a "# <relative path>" heading followed by a fenced code
     block. The fence is lengthened when the file content itself contains
     a run of three or more backticks, so the block cannot be closed early.

     Args:
         directory (Path): The root directory to search for files.

     Returns:
         str: A formatted string containing file paths and their respective contents.
     """
     entries = []

     for root, dirs, files in os.walk(directory, topdown=True):
         root_path = Path(root)

         # Prune in place so os.walk never descends into excluded directories.
         dirs[:] = sorted(d for d in dirs if not is_excluded(root_path / d, directory))

         for file_name in sorted(files):
             file_path = root_path / file_name

             # The is_file() check also drops broken symlinks and special files.
             if is_excluded(file_path, directory) or not file_path.is_file():
                 continue

             language = determine_code_block_language(file_path)

             # A recognized source extension is trusted as text. The MIME guess is
             # only a pre-filter for everything else (e.g. ".ts" maps to a video type).
             if not language:
                 mime_type, _ = mimetypes.guess_type(file_path)
                 if mime_type and not mime_type.startswith('text/'):
                     continue

             try:
                 content = file_path.read_text(encoding="utf-8")
             except (UnicodeDecodeError, OSError):
                 # Binary or unreadable file: skip it.
                 continue

             # Pick a fence that does not occur in the content.
             fence = "```"
             while fence in content:
                 fence += "`"

             relative_path = file_path.relative_to(directory)
             entries.append(f"# {relative_path}\n{fence}{language}\n{content}\n{fence}\n")

     return "\n".join(entries)

 def main():
     """
     Command-line entry point: dump a directory's text files into code.md.

     Expects exactly one command-line argument, the directory to scan. The
     collected markdown is written to "code.md" in the current working
     directory (not in the scanned directory).

     Exits with status 1 and a message on stderr when the argument count is
     wrong, the argument is not a directory, or the output cannot be written.
     """
     if len(sys.argv) != 2:
         print("Usage: python script.py <directory_path>", file=sys.stderr)
         sys.exit(1)

     directory_arg = sys.argv[1]
     directory = Path(directory_arg).resolve()  # Use absolute path

     if not directory.is_dir():
         print(f"Error: The path '{directory_arg}' is not a valid directory.", file=sys.stderr)
         sys.exit(1)

     # Ensure the script doesn't process its own output file later
     output_filename = "code.md"
     EXCLUDED_FILES.add(output_filename)

     print(f"Scanning directory: {directory}")
     print(f"Excluding Dirs: {EXCLUDED_DIRS}")
     print(f"Excluding Files: {EXCLUDED_FILES}")
     print(f"Excluding Suffixes: {EXCLUDED_SUFFIXES}")

     content = collect_text_files_content(directory)

     try:
         # Place output in the *calling* directory, not the target directory
         output_path = Path.cwd() / output_filename
         output_path.write_text(content, encoding="utf-8")
     except OSError as e:
         print(f"Failed to write to output file: {e}", file=sys.stderr)
         sys.exit(1)
     else:
         print(f"\nCollected source code written to {output_path}")


 if __name__ == "__main__":
     main()
	import os
	import sys
	from pathlib import Path
	import mimetypes

	# Exclusion rules for the directory scan. Sets keep membership tests cheap.

	# Directory names: a path is skipped when any of its components matches.
	EXCLUDED_DIRS = {
	    ".git",          # Git version control
	    ".vscode",       # Visual Studio Code IDE
	    ".idea",         # JetBrains IDEs (IntelliJ, PyCharm, etc.)
	    ".settings",     # Eclipse settings directory
	    "node_modules",  # NodeJS dependencies
	    "venv",          # Python virtual environments
	    "__pycache__",   # Python bytecode cache
	    "build",         # Common build output directory
	    "dist",          # Common distribution output directory
	    ".svn",          # Subversion version control
	    "target",        # Common build output (Java/Rust)
	    "out",           # Common build output
	    "bin",           # Common compiled binaries directory
	    "obj",           # Common compiled objects directory
	    ".gradle",       # Gradle build system
	    ".mvn",          # Maven build system
	}

	# Exact file names. These are compared literally against Path.name, so glob
	# patterns such as "*.lock" do not work here; use EXCLUDED_SUFFIXES for those.
	EXCLUDED_FILES = {
	    ".env",                # Environment variables
	    "code.md",             # The output file itself
	    Path(__file__).name,   # The script file itself
	    ".DS_Store",           # macOS specific file
	    "Thumbs.db",           # Windows specific file
	    ".gitignore",          # Git ignore file
	    ".gitattributes",      # Git attributes file
	    ".project",            # Eclipse project file
	    ".classpath",          # Eclipse classpath file
	    ".settings",           # Eclipse settings, in case it exists as a plain file
	    "package-lock.json",   # npm lock file; its suffix is ".json", so the ".lock" suffix rule misses it
	}

	# Lower-case file suffixes (including the leading dot).
	EXCLUDED_SUFFIXES = {
	    ".iml",   # JetBrains module file
	    ".log",   # Log files
	    ".lock",  # Lock files (poetry.lock, yarn.lock, Cargo.lock, ...)
	    ".swp",   # Vim swap files
	    ".swo",   # Vim swap files
	    # Common binary/media types; listing them avoids a pointless read attempt.
	    ".exe", ".dll", ".so", ".a", ".lib", ".o",
	    ".jar", ".war", ".ear",
	    ".zip", ".tar", ".gz", ".rar", ".7z",
	    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
	    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
	    ".mp3", ".wav", ".mp4", ".mov", ".avi",
	}


	# Lower-case file suffix (with leading dot) -> markdown code-fence language tag.
	# Built once at import time rather than on every lookup.
	_EXTENSION_TO_LANGUAGE = {
	    # Python
	    '.py': 'python', '.pyw': 'python', '.pyi': 'python',

	    # JavaScript
	    '.js': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',

	    # TypeScript
	    '.ts': 'typescript', '.tsx': 'typescript',

	    # Java
	    '.java': 'java', '.jsp': 'java',

	    # C
	    '.c': 'c', '.h': 'c',

	    # C++
	    '.cpp': 'cpp', '.cxx': 'cpp', '.cc': 'cpp', '.hpp': 'cpp', '.hxx': 'cpp', '.hh': 'cpp',

	    # C#
	    '.cs': 'csharp', '.csx': 'csharp',

	    # Go
	    '.go': 'go',

	    # Ruby
	    '.rb': 'ruby', '.erb': 'ruby', '.rake': 'ruby',

	    # PHP
	    '.php': 'php', '.phtml': 'php',

	    # HTML
	    '.html': 'html', '.htm': 'html',

	    # CSS
	    '.css': 'css', '.scss': 'css', '.sass': 'css',

	    # JSON
	    '.json': 'json', '.geojson': 'json', '.jsonld': 'json',

	    # XML
	    '.xml': 'xml', '.xsl': 'xml', '.xsd': 'xml', '.rss': 'xml',

	    # Shell/Bash
	    '.sh': 'bash', '.bash': 'bash', '.zsh': 'bash', '.ksh': 'bash', '.bats': 'bash',

	    # Markdown
	    '.md': 'markdown', '.markdown': 'markdown',

	    # YAML
	    '.yml': 'yaml', '.yaml': 'yaml',

	    # Rust
	    '.rs': 'rust', '.rlib': 'rust',

	    # Swift
	    '.swift': 'swift',

	    # Kotlin
	    '.kt': 'kotlin', '.kts': 'kotlin',

	    # Dockerfile given as a suffix (e.g. "api.dockerfile")
	    '.dockerfile': 'dockerfile',

	    # SQL
	    '.sql': 'sql',

	    # Properties
	    '.properties': 'properties',

	    # TOML
	    '.toml': 'toml',

	    # PowerShell
	    '.ps1': 'powershell',

	    # Groovy
	    '.groovy': 'groovy', '.gradle': 'groovy',
	}


	def determine_code_block_language(file_path: Path) -> str:
	    """
	    Attempt to guess the programming language based on file name and extension.

	    The lookup is case-insensitive. A file named exactly "Dockerfile" is
	    always reported as 'dockerfile'. Otherwise the suffix decides; when the
	    suffix is unknown and the name starts with "Dockerfile." (for example
	    "Dockerfile.dev"), 'dockerfile' is used as a fallback.

	    Args:
	        file_path (Path): The path to the file. Only its name is inspected;
	            the file does not need to exist.

	    Returns:
	        str: A language identifier for markdown code blocks (e.g., 'python', 'go'),
	            or an empty string when the language cannot be determined.
	    """
	    lowered_name = file_path.name.lower()

	    # Dockerfiles usually have no extension, so match on the whole name first.
	    if lowered_name == 'dockerfile':
	        return 'dockerfile'

	    language = _EXTENSION_TO_LANGUAGE.get(file_path.suffix.lower(), '')

	    # Variants such as "Dockerfile.prod": only used when the suffix itself is
	    # not a known language, so "Dockerfile.md" is still markdown.
	    if not language and lowered_name.startswith('dockerfile.'):
	        return 'dockerfile'

	    return language

	def is_excluded(path: Path, root: "Path | None" = None) -> bool:
	    """
	    Checks if a path should be excluded based on defined rules.

	    A path is excluded when one of its components is listed in EXCLUDED_DIRS,
	    or when it is an existing file whose name is in EXCLUDED_FILES or whose
	    lower-cased suffix is in EXCLUDED_SUFFIXES.

	    Args:
	        path (Path): The path to check (can be file or directory).
	        root (Path | None): Optional scan root. When given and `path` lies
	            under it, only the components below `root` are compared against
	            EXCLUDED_DIRS, so a scan root that itself sits inside a folder
	            named e.g. "build" or "bin" is not rejected wholesale. When
	            omitted, every component of `path` is checked.

	    Returns:
	        bool: True if the path should be excluded, False otherwise.
	    """
	    components = path.parts
	    if root is not None:
	        try:
	            components = path.relative_to(root).parts
	        except ValueError:
	            # `path` is not located under `root`; keep checking the full path.
	            pass

	    if any(part in EXCLUDED_DIRS for part in components):
	        return True

	    # Name/suffix rules only apply to existing files (is_file() hits the filesystem).
	    if path.is_file():
	        return path.name in EXCLUDED_FILES or path.suffix.lower() in EXCLUDED_SUFFIXES

	    return False


	def collect_text_files_content(directory: Path) -> str:
	    """
	    Collects the content of non-binary, UTF-8 encoded text files in the given directory,
	    excluding common IDE, git, and environment files/folders.

	    Directories and files are visited in sorted order so the result is
	    reproducible. Exclusion rules are applied to the part of each path below
	    `directory` only. Files that cannot be read or decoded as UTF-8 are
	    skipped silently.

	    Each file becomes a "# <relative path>" heading followed by a fenced code
	    block. The fence is lengthened when the file content itself contains
	    a run of three or more backticks, so the block cannot be closed early.

	    Args:
	        directory (Path): The root directory to search for files.

	    Returns:
	        str: A formatted string containing file paths and their respective contents.
	    """
	    entries = []

	    for root, dirs, files in os.walk(directory, topdown=True):
	        root_path = Path(root)

	        # Prune in place so os.walk never descends into excluded directories.
	        dirs[:] = sorted(d for d in dirs if not is_excluded(root_path / d, directory))

	        for file_name in sorted(files):
	            file_path = root_path / file_name

	            # The is_file() check also drops broken symlinks and special files.
	            if is_excluded(file_path, directory) or not file_path.is_file():
	                continue

	            language = determine_code_block_language(file_path)

	            # A recognized source extension is trusted as text. The MIME guess is
	            # only a pre-filter for everything else (e.g. ".ts" maps to a video type).
	            if not language:
	                mime_type, _ = mimetypes.guess_type(file_path)
	                if mime_type and not mime_type.startswith('text/'):
	                    continue

	            try:
	                content = file_path.read_text(encoding="utf-8")
	            except (UnicodeDecodeError, OSError):
	                # Binary or unreadable file: skip it.
	                continue

	            # Pick a fence that does not occur in the content.
	            fence = "```"
	            while fence in content:
	                fence += "`"

	            relative_path = file_path.relative_to(directory)
	            entries.append(f"# {relative_path}\n{fence}{language}\n{content}\n{fence}\n")

	    return "\n".join(entries)

	def main():
	    """
	    Command-line entry point: dump a directory's text files into code.md.

	    Expects exactly one command-line argument, the directory to scan. The
	    collected markdown is written to "code.md" in the current working
	    directory (not in the scanned directory).

	    Exits with status 1 and a message on stderr when the argument count is
	    wrong, the argument is not a directory, or the output cannot be written.
	    """
	    if len(sys.argv) != 2:
	        print("Usage: python script.py <directory_path>", file=sys.stderr)
	        sys.exit(1)

	    directory_arg = sys.argv[1]
	    directory = Path(directory_arg).resolve()  # Use absolute path

	    if not directory.is_dir():
	        print(f"Error: The path '{directory_arg}' is not a valid directory.", file=sys.stderr)
	        sys.exit(1)

	    # Ensure the script doesn't process its own output file later
	    output_filename = "code.md"
	    EXCLUDED_FILES.add(output_filename)

	    print(f"Scanning directory: {directory}")
	    print(f"Excluding Dirs: {EXCLUDED_DIRS}")
	    print(f"Excluding Files: {EXCLUDED_FILES}")
	    print(f"Excluding Suffixes: {EXCLUDED_SUFFIXES}")

	    content = collect_text_files_content(directory)

	    try:
	        # Place output in the calling directory, not the target directory
	        output_path = Path.cwd() / output_filename
	        output_path.write_text(content, encoding="utf-8")
	    except OSError as e:
	        print(f"Failed to write to output file: {e}", file=sys.stderr)
	        sys.exit(1)
	    else:
	        print(f"\nCollected source code written to {output_path}")


	if __name__ == "__main__":
	    main()