Skip to content

Instantly share code, notes, and snippets.

@RohanAwhad
Last active April 17, 2024 22:22
Show Gist options
  • Save RohanAwhad/cd71f939710d73f16fc9e20f9dd90ba3 to your computer and use it in GitHub Desktop.
Save RohanAwhad/cd71f939710d73f16fc9e20f9dd90ba3 to your computer and use it in GitHub Desktop.
A utility script to download and process files from a GitHub repository based on language-specific criteria.
# Inspiration from Eric Hartford github2file: https://github.com/cognitivecomputations/github2file
"""
A utility script to download and process files from a GitHub repository based on language-specific criteria.
Features:
- Download files from a specific branch or tag of a GitHub repository.
- Filter files based on their extension to include only those relevant to the specified programming language.
- Exclude files located in certain directories or those that match a set of predefined non-useful criteria.
- Check and exclude test files based on content indicators (currently defined for Python and Go only).
- Strip comments and docstrings from Python files by default to focus on the executable code; pass --keep-comments to retain them.
- Save the cleaned and filtered code to a single output file with annotations indicating the original file paths.
Usage:
The script is run from the command line with arguments specifying the repository URL, the target programming language,
and other optional parameters like whether to keep comments and which branch or tag to download.
Supported languages include Python, Go, JavaScript, TypeScript, Java, C, C++, and several others. Python and Go have
language-specific exclusion rules and test-file detection; all other languages are filtered by file extension and the generic path rules only.
Example:
```bash
python github2file.py https://github.com/example/repo --lang python --keep-comments --branch_or_tag main
```
This will download Python files from the 'main' branch of the repository, keeping comments and docstrings, and save
them to an output file named after the repository and language.
"""
import argparse
import ast
import io
import re
import sys
import zipfile
from typing import List

import requests
# =============================================================================
# CONSTANTS
# =============================================================================
# Maps every supported --lang value to the file extensions collected for it.
# Keys are lowercase; they double as the argparse choices for --lang.
LANGUAGE_EXTENSIONS = {
    "python": [".py", ".pyw"],
    "go": [".go"],
    "md": [".md"],  # Markdown documents
    "js": [".js", ".ts", ".jsx", ".tsx"],  # JavaScript and TypeScript share one key
    "java": [".java"],
    "c": [".c", ".h"],  # headers are part of a C code base too
    "cpp": [".cpp", ".cc", ".h", ".hpp"],
    "csharp": [".cs"],
    "ruby": [".rb"],
    "php": [".php"],
    "swift": [".swift"],
    "kotlin": [".kt"],
    "rust": [".rs"],
    "scala": [".scala"],
    # Extension matching is case-sensitive and R scripts are conventionally
    # saved as ".R", so both spellings have to be listed.
    "r": [".r", ".R"],
}
def get_language_extensions(language: str) -> List[str]:
    """Return the extensions registered for ``language``.

    The name is lowercased before the lookup in ``LANGUAGE_EXTENSIONS``.

    Raises:
        KeyError: if the lowercased name is not a key of ``LANGUAGE_EXTENSIONS``.
    """
    normalized = language.lower()
    return LANGUAGE_EXTENSIONS[normalized]
def is_file_type(file_path: str, language: str) -> bool:
    """Tell whether ``file_path`` ends with one of the extensions of ``language``."""
    # str.endswith accepts a tuple of candidate suffixes and checks them all.
    suffixes = tuple(get_language_extensions(language))
    return file_path.endswith(suffixes)
# Splits a path into words: an acronym run ("HTTP"), an optionally capitalised
# lowercase word ("Test", "model") or a digit run. Separators are dropped.
_PATH_WORD_RE = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+")
# Words that mark a path as test code when they appear as a whole word.
_TEST_WORDS = frozenset({"test", "tests", "testing", "conftest"})


def is_likely_useful_file(file_path, lang):
    """Determine if the file is likely to be useful, judging by its path alone.

    A path is rejected when any of the following holds:
    - a path component starts with "." (hidden file or directory);
    - one of its words is a test marker ("test", "tests", "testing", "conftest");
    - it lies inside an excluded directory (examples, tests, scripts, ...);
    - it contains the name of a known utility, config, workflow or doc file.

    Args:
        file_path: "/"-separated path of the file.
        lang: Language name; compared case-insensitively. Only "python" and
            "go" add language-specific exclusions.

    Returns:
        True if the file should be kept, False otherwise.
    """
    # Normalise so that "Python" and "python" select the same rules.
    lang = lang.lower()

    excluded_dirs = ["examples", "tests", "test", "scripts", "utils", "benchmarks", ".vscode"]
    utility_or_config_files = []
    github_workflow_or_docs = [".github", ".gitignore", "LICENSE"]
    if lang == "python":
        excluded_dirs.append("__pycache__")
        utility_or_config_files.extend(["hubconf.py", "setup.py"])
        github_workflow_or_docs.extend(["stale.py", "gen-card-", "write_model_card"])
    elif lang == "go":
        excluded_dirs.append("vendor")
        utility_or_config_files.extend(["go.mod", "go.sum", "Makefile"])

    if any(part.startswith('.') for part in file_path.split('/')):
        return False

    # Match "test" as a whole word (snake_case, kebab-case, dotted or camelCase)
    # rather than as a raw substring, so that "latest.py", "contest.py" or a
    # repository called "pytest" are not mistaken for test code.
    path_words = {word.lower() for word in _PATH_WORD_RE.findall(file_path)}
    if not _TEST_WORDS.isdisjoint(path_words):
        return False

    for excluded_dir in excluded_dirs:
        if f"/{excluded_dir}/" in file_path or file_path.startswith(excluded_dir + "/"):
            return False
    for file_name in utility_or_config_files:
        if file_name in file_path:
            return False
    for doc_file in github_workflow_or_docs:
        if doc_file in file_path:
            return False
    return True
def is_test_file(file_content, lang):
    """Determine if the file content suggests it is a test file.

    The check is a plain substring search for language-specific markers.

    Args:
        file_content: Full text of the file.
        lang: Language name; compared case-insensitively. Markers exist only
            for "python" and "go"; any other language always yields False.

    Returns:
        True if at least one marker occurs in ``file_content``.
    """
    indicators_by_lang = {
        "python": ("import unittest", "import pytest", "from unittest", "from pytest"),
        # Go import paths are quoted string literals, both in the single-line
        # form (import "testing") and inside an import ( ... ) block, so the
        # quoted path is the marker; an unquoted `import testing` never occurs.
        "go": ('"testing"', "func Test"),
    }
    test_indicators = indicators_by_lang.get(lang.lower(), ())
    return any(indicator in file_content for indicator in test_indicators)
def has_sufficient_content(file_content, min_line_count=10):
    """Report whether the text holds at least ``min_line_count`` substantive lines.

    A line is substantive when, after stripping surrounding whitespace, it is
    non-empty and does not begin with ``#`` or ``//``.
    """
    substantive = 0
    for raw_line in file_content.split('\n'):
        text = raw_line.strip()
        if not text:
            continue  # blank line
        if text.startswith('#') or text.startswith('//'):
            continue  # line comment
        substantive += 1
    return substantive >= min_line_count
def remove_comments_and_docstrings(source):
    """Remove comments and docstrings from the Python source code.

    The source is parsed to an AST and unparsed again. Comments disappear
    because the AST does not record them; docstrings and any other bare string
    statements are dropped from every statement list.

    Args:
        source: Python source text.

    Returns:
        The regenerated source code without comments and docstrings.

    Raises:
        SyntaxError: if ``source`` is not valid Python.
    """
    tree = ast.parse(source)
    for node in ast.walk(tree):
        for field in ("body", "orelse", "finalbody"):
            statements = getattr(node, field, None)
            # Lambda and IfExp also have a `body`, but it is a single expression.
            if not isinstance(statements, list):
                continue
            kept = [
                stmt for stmt in statements
                if not (
                    isinstance(stmt, ast.Expr)
                    and isinstance(stmt.value, ast.Constant)
                    and isinstance(stmt.value.value, str)
                )
            ]
            # A block may not be left empty (e.g. a function consisting only
            # of its docstring), otherwise the unparsed code would be invalid.
            if statements and not kept and not isinstance(node, ast.Module):
                kept = [ast.Pass()]
            setattr(node, field, kept)
    return ast.unparse(tree)
def download_repo(repo_url, output_file, lang, keep_comments=False, branch_or_tag="master"):
    """Download a GitHub repository archive and merge its relevant files into one text file.

    Args:
        repo_url: URL of the repository; a trailing "/" or ".git" is ignored.
        output_file: Path of the text file to write (overwritten if present).
        lang: Language whose files are collected.
        keep_comments: When False and ``lang`` is "python", comments and
            docstrings are stripped from each file before it is written.
        branch_or_tag: Name of the branch or tag to download.

    Exits the process with status 1 if the archive cannot be downloaded.
    """
    base_url = repo_url.rstrip("/")
    if base_url.endswith(".git"):
        base_url = base_url[:-len(".git")]

    # Branch and tag archives live under different ref namespaces, so try the
    # branch URL first and fall back to the tag URL.
    response = None
    for ref_namespace in ("heads", "tags"):
        download_url = f"{base_url}/archive/refs/{ref_namespace}/{branch_or_tag}.zip"
        print(download_url)
        response = requests.get(download_url, timeout=60)
        if response.status_code == 200:
            break
    else:
        print(f"Failed to download the repository. Status code: {response.status_code}")
        sys.exit(1)

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for file_path in zip_file.namelist():
            # Skip directory entries
            if file_path.endswith("/"):
                continue
            # The archive wraps everything in one root folder named after the
            # repository and ref; drop it so that the repository name cannot
            # trigger the path-based exclusion rules.
            _, _, relative_path = file_path.partition("/")
            relative_path = relative_path or file_path
            # Skip non-language files, less likely useful files, hidden directories, and test files
            if not is_file_type(file_path, lang) or not is_likely_useful_file(relative_path, lang):
                continue
            try:
                file_content = zip_file.read(file_path).decode("utf-8")
            except UnicodeDecodeError:
                # Skip files that are not valid UTF-8 instead of aborting the run
                continue
            # Skip test files based on content and files with insufficient substantive content
            if is_test_file(file_content, lang) or not has_sufficient_content(file_content):
                continue
            if lang == "python" and not keep_comments:
                try:
                    file_content = remove_comments_and_docstrings(file_content)
                except (SyntaxError, ValueError):
                    # Skip files that cannot be parsed (syntax errors, null bytes)
                    continue
            outfile.write(f"// File: {file_path}\n" if lang == "go" else f"# File: {file_path}\n")
            outfile.write(file_content)
            outfile.write("\n\n")
# Command-line entry point: parse the arguments, download the repository and
# write the combined source to "<repo>_<lang>.txt" in the current directory.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download and process files from a GitHub repository.')
    parser.add_argument('repo_url', type=str, help='The URL of the GitHub repository')
    parser.add_argument('--lang', type=str, choices=list(LANGUAGE_EXTENSIONS.keys()), default='python', help='The programming language of the repository')
    parser.add_argument('--keep-comments', action='store_true', help='Keep comments and docstrings in the source code (only applicable for Python)')
    parser.add_argument('--branch_or_tag', type=str, help='The branch or tag of the repository to download', default="master")
    args = parser.parse_args()

    # Take the last URL segment as the repository name. Strip a trailing "/"
    # first (it would leave an empty name) and drop a ".git" suffix so that it
    # does not end up in the output file name.
    repo_name = args.repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    output_file = f"{repo_name}_{args.lang}.txt"

    download_repo(args.repo_url, output_file, args.lang, args.keep_comments, args.branch_or_tag)
    print(f"Combined {args.lang.capitalize()} source code saved to {output_file}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment