Skip to content

Instantly share code, notes, and snippets.

@mshafiee
Last active May 30, 2025 08:40
Show Gist options
  • Save mshafiee/5520f1212131681a7b926dab96dc1156 to your computer and use it in GitHub Desktop.
Save mshafiee/5520f1212131681a7b926dab96dc1156 to your computer and use it in GitHub Desktop.
import os
import sys
from pathlib import Path
import mimetypes
# Directory names that are skipped entirely (version control metadata, IDE
# state, dependency caches and build output). Entries are compared against
# individual path components by exact name.
EXCLUDED_DIRS = {
    ".git",  # Git version control
    ".svn",  # Subversion version control
    ".vscode",  # Visual Studio Code IDE
    ".idea",  # JetBrains IDEs (IntelliJ, PyCharm, etc.)
    # Eclipse settings directory. The ".settings" entry in EXCLUDED_FILES is
    # only consulted for regular files, so the directory has to be listed here.
    ".settings",
    "node_modules",  # NodeJS dependencies
    "venv",  # Python virtual environments
    ".venv",  # Python virtual environments (dot-prefixed convention)
    "__pycache__",  # Python bytecode cache
    "build",  # Common build output directory
    "dist",  # Common distribution output directory
    "target",  # Common build output (Java/Rust)
    "out",  # Common build output
    "bin",  # Common compiled binaries directory
    "obj",  # Common compiled objects directory
    ".gradle",  # Gradle build system
    ".mvn",  # Maven build system
}
# File names that are skipped wherever they appear. Entries are compared
# against the file's name for exact equality -- glob patterns are NOT expanded,
# so pattern-style exclusions belong in EXCLUDED_SUFFIXES instead.
# This must stay a mutable set: main() adds the output file name at runtime.
EXCLUDED_FILES = {
    ".env",  # Environment variables
    "code.md",  # The output file itself
    Path(__file__).name,  # The script file itself
    ".DS_Store",  # macOS specific file
    "Thumbs.db",  # Windows specific file
    ".gitignore",  # Git ignore file
    ".gitattributes",  # Git attributes file
    ".project",  # Eclipse project file
    ".classpath",  # Eclipse classpath file
    ".settings",  # Eclipse settings (only matches a regular file of that name)
    # Lock files ending in ".lock" (poetry.lock, yarn.lock, ...) are covered by
    # the ".lock" entry in EXCLUDED_SUFFIXES. package-lock.json has the suffix
    # ".json", so the suffix rule cannot catch it and it is listed by name.
    "package-lock.json",
}
# Lower-case file suffixes (including the leading dot) whose files are skipped.
# Most binary formats would also be rejected when the UTF-8 read fails; listing
# them here simply avoids opening them at all.
EXCLUDED_SUFFIXES = {
    # Editor / IDE / tooling artefacts
    ".iml",
    ".lock",
    ".log",
    ".swo",
    ".swp",
    # Compiled objects, libraries and executables
    ".a",
    ".dll",
    ".exe",
    ".lib",
    ".o",
    ".so",
    # Java archives
    ".ear",
    ".jar",
    ".war",
    # Compressed archives
    ".7z",
    ".gz",
    ".rar",
    ".tar",
    ".zip",
    # Images
    ".bmp",
    ".gif",
    ".ico",
    ".jpeg",
    ".jpg",
    ".png",
    ".svg",
    # Office documents
    ".doc",
    ".docx",
    ".pdf",
    ".ppt",
    ".pptx",
    ".xls",
    ".xlsx",
    # Audio / video
    ".avi",
    ".mov",
    ".mp3",
    ".mp4",
    ".wav",
}
# Lower-case file suffix -> markdown code-fence language identifier.
# Built once at import time instead of on every call.
_EXTENSION_TO_LANGUAGE = {
    # Python
    ".py": "python", ".pyw": "python", ".pyi": "python",
    # JavaScript
    ".js": "javascript", ".mjs": "javascript", ".cjs": "javascript",
    # TypeScript
    ".ts": "typescript", ".tsx": "typescript",
    # Java
    ".java": "java", ".jsp": "java",
    # C
    ".c": "c", ".h": "c",
    # C++
    ".cpp": "cpp", ".cxx": "cpp", ".cc": "cpp",
    ".hpp": "cpp", ".hxx": "cpp", ".hh": "cpp",
    # C#
    ".cs": "csharp", ".csx": "csharp",
    # Go
    ".go": "go",
    # Ruby
    ".rb": "ruby", ".erb": "ruby", ".rake": "ruby",
    # PHP
    ".php": "php", ".phtml": "php",
    # HTML
    ".html": "html", ".htm": "html",
    # CSS
    ".css": "css", ".scss": "css", ".sass": "css",
    # JSON
    ".json": "json", ".geojson": "json", ".jsonld": "json",
    # XML
    ".xml": "xml", ".xsl": "xml", ".xsd": "xml", ".rss": "xml",
    # Shell/Bash
    ".sh": "bash", ".bash": "bash", ".zsh": "bash", ".ksh": "bash",
    ".bats": "bash",
    # Markdown
    ".md": "markdown", ".markdown": "markdown",
    # YAML
    ".yml": "yaml", ".yaml": "yaml",
    # Rust
    ".rs": "rust", ".rlib": "rust",
    # Swift
    ".swift": "swift",
    # Kotlin
    ".kt": "kotlin", ".kts": "kotlin",
    # SQL
    ".sql": "sql",
    # Properties
    ".properties": "properties",
    # TOML
    ".toml": "toml",
    # PowerShell
    ".ps1": "powershell",
    # Groovy
    ".groovy": "groovy", ".gradle": "groovy",
}


def determine_code_block_language(file_path: Path) -> str:
    """
    Guess the markdown code-block language for a file.

    A file named ``Dockerfile`` (any letter case) is recognised by name;
    everything else is looked up by its lower-cased final suffix.

    Args:
        file_path (Path): The path to the file. Only the name is inspected;
            the file does not need to exist.

    Returns:
        str: A language identifier for markdown code blocks (e.g. 'python',
        'go'), or an empty string when the file type is not recognised.
    """
    # A Dockerfile has no suffix, so it can only be matched on the name.
    if file_path.name.lower() == "dockerfile":
        return "dockerfile"
    return _EXTENSION_TO_LANGUAGE.get(file_path.suffix.lower(), "")
def is_excluded(path: Path, root: "Path | None" = None) -> bool:
    """
    Check whether a path should be excluded based on the module's rules.

    A path is excluded when one of its components is listed in EXCLUDED_DIRS,
    or when it is an existing regular file whose name is in EXCLUDED_FILES or
    whose lower-cased suffix is in EXCLUDED_SUFFIXES.

    Args:
        path (Path): The path to check (can be file or directory).
        root (Path | None): Optional scan root. When given and ``path`` lies
            beneath it, only the components below ``root`` are compared with
            EXCLUDED_DIRS, so a project stored under e.g. ``/home/me/build/``
            is not excluded wholesale because of where it lives. When omitted,
            every component of ``path`` is checked.

    Returns:
        bool: True if the path should be excluded, False otherwise.
    """
    parts = path.parts
    if root is not None:
        try:
            parts = path.relative_to(root).parts
        except ValueError:
            # path is not located under root: keep checking every component.
            pass

    if any(part in EXCLUDED_DIRS for part in parts):
        return True

    # Name and suffix rules only apply to regular files.
    if path.is_file():
        return (
            path.name in EXCLUDED_FILES
            or path.suffix.lower() in EXCLUDED_SUFFIXES
        )
    return False


def _code_fence(content: str) -> str:
    """Return a backtick fence that does not occur anywhere in *content*."""
    fence = "```"
    # Content that itself contains ``` (markdown files, docstrings with
    # examples) would end the block early, so lengthen the fence until it is
    # longer than any backtick run in the content.
    while fence in content:
        fence += "`"
    return fence


def collect_text_files_content(directory: Path) -> str:
    """
    Collect the content of UTF-8 text files below a directory as markdown.

    The tree is walked top-down. Excluded directories are pruned before they
    are entered, and excluded files are skipped (see ``is_excluded``). Files
    that cannot be read or decoded as UTF-8 are skipped silently. Directories
    and files are visited in sorted name order so the result does not depend
    on the order in which the filesystem lists entries.

    Args:
        directory (Path): The root directory to search for files.

    Returns:
        str: One entry per file, each made of a ``# <relative path>`` heading
        followed by the file content in a fenced code block; entries are
        separated by a blank line. Empty string when nothing was collected.
    """
    collected_content = []

    for root, dirs, files in os.walk(directory, topdown=True):
        root_path = Path(root)

        # Assigning to dirs[:] (in place) is what stops os.walk from
        # descending into the removed directories.
        dirs[:] = sorted(
            d for d in dirs if not is_excluded(root_path / d, directory)
        )

        for file_name in sorted(files):
            file_path = root_path / file_name

            if is_excluded(file_path, directory):
                continue
            # os.walk also lists entries such as broken symlinks under files.
            if not file_path.is_file():
                continue

            # Skip files whose guessed MIME type is clearly not text. Several
            # source extensions map to non-text types (e.g. application/json),
            # so any file with a recognised language is still attempted; the
            # UTF-8 read below is the real binary filter.
            mime_type, _ = mimetypes.guess_type(file_path)
            if (
                mime_type
                and not mime_type.startswith("text/")
                and not determine_code_block_language(file_path)
            ):
                continue

            relative_path = file_path.relative_to(directory)
            try:
                with file_path.open("r", encoding="utf-8") as file:
                    content = file.read()
            except (UnicodeDecodeError, OSError):
                # Binary or unreadable file: best-effort collection, skip it.
                continue
            except Exception as e:
                # Anything else is unexpected: report it and move on.
                print(
                    f"Unexpected error reading {relative_path}: {e}",
                    file=sys.stderr,
                )
                continue

            language = determine_code_block_language(file_path)
            fence = _code_fence(content)
            collected_content.append(
                f"# {relative_path}\n{fence}{language}\n{content}\n{fence}\n"
            )

    return "\n".join(collected_content)
def main():
    """
    Command-line entry point.

    Expects exactly one argument, the directory to scan. Collects the text
    files below it and writes the result to ``code.md`` in the current working
    directory (not in the scanned directory).

    Exits with status 1 when the argument count is wrong, when the argument is
    not a directory, or when the output file cannot be written. All error and
    usage messages go to stderr; progress messages go to stdout.
    """
    if len(sys.argv) != 2:
        print("Usage: python script.py <directory_path>", file=sys.stderr)
        sys.exit(1)

    directory_arg = sys.argv[1]
    directory = Path(directory_arg).resolve()  # Use absolute path
    if not directory.is_dir():
        print(
            f"Error: The path '{directory_arg}' is not a valid directory.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Ensure the script doesn't process its own output file later
    output_filename = "code.md"
    EXCLUDED_FILES.add(output_filename)

    print(f"Scanning directory: {directory}")
    print(f"Excluding Dirs: {EXCLUDED_DIRS}")
    print(f"Excluding Files: {EXCLUDED_FILES}")
    print(f"Excluding Suffixes: {EXCLUDED_SUFFIXES}")

    content = collect_text_files_content(directory)

    try:
        # Place output in the *calling* directory, not the target directory
        output_path = Path.cwd() / output_filename
        output_path.write_text(content, encoding="utf-8")
    except OSError as e:
        print(f"Failed to write to output file: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        # Reported outside the try body so that a failing print is not
        # mistaken for a failure to write the output file.
        print(f"\nCollected source code written to {output_path}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment