Last active
May 30, 2025 08:40
-
-
Save mshafiee/5520f1212131681a7b926dab96dc1156 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from pathlib import Path | |
import mimetypes | |
# Exclusion rules applied while scanning.
# Directory and file names are compared by exact, case-sensitive name;
# suffixes are compared against the lower-cased file extension.
EXCLUDED_DIRS = {
    ".git",          # Git version control
    ".vscode",       # Visual Studio Code IDE
    ".idea",         # JetBrains IDEs (IntelliJ, PyCharm, etc.)
    "node_modules",  # NodeJS dependencies
    "venv",          # Python virtual environments
    "__pycache__",   # Python bytecode cache
    "build",         # Common build output directory
    "dist",          # Common distribution output directory
    ".svn",          # Subversion version control
    "target",        # Common build output (Java/Rust)
    "out",           # Common build output
    "bin",           # Common compiled binaries directory
    "obj",           # Common compiled objects directory
    ".gradle",       # Gradle build system
    ".mvn",          # Maven build system
    ".settings",     # Eclipse settings directory (it is a directory, so it belongs here)
}

# NOTE: this must stay a mutable set; main() adds the output file name to it.
EXCLUDED_FILES = {
    ".env",               # Environment variables
    "code.md",            # The output file itself
    Path(__file__).name,  # The script file itself
    ".DS_Store",          # macOS specific file
    "Thumbs.db",          # Windows specific file
    ".gitignore",         # Git ignore file
    ".gitattributes",     # Git attributes file
    ".project",           # Eclipse project file
    ".classpath",         # Eclipse classpath file
    # Lock files ending in ".lock" (poetry.lock, yarn.lock, ...) are covered by
    # EXCLUDED_SUFFIXES. Names here are matched exactly, so a glob such as
    # "*.lock" would never match anything and is deliberately not listed.
    "package-lock.json",  # npm lock file: its suffix is ".json", so it needs its own entry
}

EXCLUDED_SUFFIXES = {
    ".iml",   # JetBrains module file
    ".log",   # Log files
    ".lock",  # Lock files
    ".swp",   # Vim swap files
    ".swo",   # Vim swap files
    # Common binary/media types; files that fail UTF-8 decoding are skipped
    # anyway, listing them here just avoids the read attempt.
    ".exe", ".dll", ".so", ".a", ".lib", ".o",
    ".jar", ".war", ".ear",
    ".zip", ".tar", ".gz", ".rar", ".7z",
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg",
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    ".mp3", ".wav", ".mp4", ".mov", ".avi",
}
# Lower-cased file extension -> markdown code-fence language identifier.
# Built once at import time instead of on every call.
_EXTENSION_TO_LANGUAGE = {
    # Python
    '.py': 'python', '.pyw': 'python', '.pyi': 'python',
    # JavaScript
    '.js': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',
    # TypeScript
    '.ts': 'typescript', '.tsx': 'typescript',
    # Java
    '.java': 'java', '.jsp': 'java',
    # C
    '.c': 'c', '.h': 'c',
    # C++
    '.cpp': 'cpp', '.cxx': 'cpp', '.cc': 'cpp', '.hpp': 'cpp', '.hxx': 'cpp', '.hh': 'cpp',
    # C#
    '.cs': 'csharp', '.csx': 'csharp',
    # Go
    '.go': 'go',
    # Ruby
    '.rb': 'ruby', '.erb': 'ruby', '.rake': 'ruby',
    # PHP
    '.php': 'php', '.phtml': 'php',
    # HTML
    '.html': 'html', '.htm': 'html',
    # CSS
    '.css': 'css', '.scss': 'css', '.sass': 'css',
    # JSON
    '.json': 'json', '.geojson': 'json', '.jsonld': 'json',
    # XML
    '.xml': 'xml', '.xsl': 'xml', '.xsd': 'xml', '.rss': 'xml',
    # Shell/Bash
    '.sh': 'bash', '.bash': 'bash', '.zsh': 'bash', '.ksh': 'bash', '.bats': 'bash',
    # Markdown
    '.md': 'markdown', '.markdown': 'markdown',
    # YAML
    '.yml': 'yaml', '.yaml': 'yaml',
    # Rust
    '.rs': 'rust', '.rlib': 'rust',
    # Swift
    '.swift': 'swift',
    # Kotlin
    '.kt': 'kotlin', '.kts': 'kotlin',
    # Dockerfile with an extension (e.g. "api.dockerfile"); the key needs the
    # leading dot because Path.suffix always includes it.
    '.dockerfile': 'dockerfile',
    # SQL
    '.sql': 'sql',
    # Properties
    '.properties': 'properties',
    # TOML
    '.toml': 'toml',
    # PowerShell
    '.ps1': 'powershell',
    # Groovy
    '.groovy': 'groovy', '.gradle': 'groovy',
}


def determine_code_block_language(file_path: Path) -> str:
    """
    Guess the markdown code-fence language for a file from its name.

    The lookup is case-insensitive. Besides the extension table, files named
    "Dockerfile" (no extension) and variants such as "Dockerfile.dev" are
    recognised as Dockerfiles.

    Args:
        file_path (Path): The path to the file. Only its name is inspected;
            the file does not have to exist.

    Returns:
        str: A language identifier for markdown code blocks (e.g., 'python',
        'go'), or an empty string when the language is unknown.
    """
    name = file_path.name.lower()

    # Dockerfiles usually have no extension at all.
    if name == 'dockerfile':
        return 'dockerfile'

    language = _EXTENSION_TO_LANGUAGE.get(file_path.suffix.lower(), '')
    if language:
        return language

    # "Dockerfile.dev", "Dockerfile.prod", ...: the part after the dot is a
    # variant tag, not a real extension. Checked last so that a known
    # extension still wins.
    if name.startswith('dockerfile.'):
        return 'dockerfile'

    return ''
def is_excluded(path: Path, root: "Path | None" = None) -> bool:
    """
    Check whether a path should be skipped based on the exclusion sets.

    Args:
        path (Path): The path to check (can be file or directory).
        root (Path | None): Optional scan root. When given and `path` lies
            under it, only the components below `root` are compared against
            EXCLUDED_DIRS, so a scan root that itself lives under a directory
            named e.g. "build" or "bin" is not excluded wholesale. When
            omitted, every component of `path` is checked.

    Returns:
        bool: True if the path should be excluded, False otherwise.
    """
    parts = path.parts
    if root is not None:
        try:
            parts = path.relative_to(root).parts
        except ValueError:
            # `path` is not located under `root`: check all of its components.
            pass

    # Any component (including the final one) matching an excluded directory
    # name excludes the path.
    if any(part in EXCLUDED_DIRS for part in parts):
        return True

    # File-name and suffix rules only apply to existing regular files.
    if path.is_file():
        return path.name in EXCLUDED_FILES or path.suffix.lower() in EXCLUDED_SUFFIXES

    return False


def _markdown_fence(content: str) -> str:
    """Return a backtick fence that is longer than any backtick run in `content`."""
    fence = "```"
    # A fenced block is closed by a backtick run at least as long as the
    # opening fence, so grow the fence until it no longer occurs in the content.
    while fence in content:
        fence += "`"
    return fence


def collect_text_files_content(directory: Path) -> str:
    """
    Collect the content of UTF-8 decodable text files below `directory`,
    skipping everything matched by the exclusion sets.

    Each file becomes a "# <relative path>" heading followed by a fenced
    markdown code block. Directories and files are visited in sorted order so
    the result does not depend on filesystem listing order.

    Args:
        directory (Path): The root directory to search for files.

    Returns:
        str: The formatted entries joined by newlines; an empty string when
        no file qualifies.
    """
    collected_content = []

    for root, dirs, files in os.walk(directory, topdown=True):
        root_path = Path(root)

        # Prune excluded directories in place so os.walk never descends into
        # them. Exclusion is evaluated relative to the scan root.
        dirs[:] = sorted(d for d in dirs if not is_excluded(root_path / d, directory))

        for file_name in sorted(files):
            file_path = root_path / file_name

            # os.walk may list entries that are not regular files (e.g.
            # broken symlinks); skip those along with excluded files.
            if is_excluded(file_path, directory) or not file_path.is_file():
                continue

            language = determine_code_block_language(file_path)

            # Skip files whose guessed mime type is non-text, unless the
            # extension maps to a known source language (mime guesses for
            # source files are unreliable, e.g. ".ts" is reported as video).
            mime_type, _ = mimetypes.guess_type(file_path)
            if mime_type and not mime_type.startswith('text/') and not language:
                continue

            try:
                content = file_path.read_text(encoding="utf-8")
            except (UnicodeDecodeError, OSError):
                # Binary or unreadable file: skipped silently by design.
                continue
            except Exception as e:
                # Log unexpected errors and carry on with the next file.
                print(f"Unexpected error reading {file_path.relative_to(directory)}: {e}", file=sys.stderr)
                continue

            fence = _markdown_fence(content)
            relative_path = file_path.relative_to(directory)
            collected_content.append(
                f"# {relative_path}\n{fence}{language}\n{content}\n{fence}\n"
            )

    return "\n".join(collected_content)
def main() -> None:
    """
    Command-line entry point.

    Expects exactly one argument, the directory to scan. Collects the text
    files below it and writes the result to "code.md" in the current working
    directory. Exits with status 1 on a usage error, an invalid directory, or
    a failure to write the output file; those messages go to stderr.
    """
    if len(sys.argv) != 2:
        print(f"Usage: python {Path(__file__).name} <directory_path>", file=sys.stderr)
        sys.exit(1)

    directory_arg = sys.argv[1]
    directory = Path(directory_arg).resolve()  # Use absolute path

    if not directory.is_dir():
        print(f"Error: The path '{directory_arg}' is not a valid directory.", file=sys.stderr)
        sys.exit(1)

    # Ensure the script doesn't process its own output file later
    output_filename = "code.md"
    EXCLUDED_FILES.add(output_filename)

    print(f"Scanning directory: {directory}")
    print(f"Excluding Dirs: {EXCLUDED_DIRS}")
    print(f"Excluding Files: {EXCLUDED_FILES}")
    print(f"Excluding Suffixes: {EXCLUDED_SUFFIXES}")

    content = collect_text_files_content(directory)

    try:
        # Place output in the *calling* directory, not the target directory
        output_path = Path.cwd() / output_filename
        output_path.write_text(content, encoding="utf-8")
    except OSError as e:
        print(f"Failed to write to output file: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Reported outside the try body so that only real write failures are
        # labelled as such.
        print(f"\nCollected source code written to {output_path}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment