Last active
April 17, 2024 22:22
-
-
Save RohanAwhad/cd71f939710d73f16fc9e20f9dd90ba3 to your computer and use it in GitHub Desktop.
A utility script to download and process files from a GitHub repository based on language-specific criteria.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inspiration from Eric Hartford github2file: https://github.com/cognitivecomputations/github2file | |
""" | |
A utility script to download and process files from a GitHub repository based on language-specific criteria. | |
Features: | |
- Download files from a specific branch or tag of a GitHub repository. | |
- Filter files based on their extension to include only those relevant to the specified programming language. | |
- Exclude files located in certain directories or those that match a set of predefined non-useful criteria. | |
- Check and exclude test files based on content indicators specific to each supported language. | |
- Optionally remove comments and Python docstrings to focus on the executable code. | |
- Save the cleaned and filtered code to a single output file with annotations indicating the original file paths. | |
Usage: | |
The script is run from the command line with arguments specifying the repository URL, the target programming language, | |
and other optional parameters like whether to keep comments and which branch or tag to download. | |
Supported languages include Python, Go, JavaScript, TypeScript, Java, C, C++, and several others, each with tailored | |
handling for file types and content considerations pertinent to those languages. | |
Example: | |
```bash | |
python github2file.py https://github.com/example/repo --lang python --keep-comments --branch_or_tag main | |
``` | |
This will download Python files from the 'main' branch of the repository, keeping comments and docstrings, and save | |
them to an output file named after the repository and language. | |
""" | |
import argparse | |
import sys | |
import requests | |
import zipfile | |
import io | |
import ast | |
from typing import List | |
# ============================================================================= | |
# CONSTANTS | |
# ============================================================================= | |
# Maps each supported ``--lang`` value to the file suffixes collected for it.
# Suffix matching is case-sensitive, so case variants that occur in practice
# must be listed explicitly.
LANGUAGE_EXTENSIONS = {
    "python": [".py", ".pyw"],
    "go": [".go"],
    "md": [".md"],  # Markdown documents
    "js": [".js", ".ts", ".jsx", ".tsx"],  # JavaScript and TypeScript share one key
    "java": [".java"],
    "c": [".c", ".h"],  # headers are part of a C code base too
    "cpp": [".cpp", ".cc", ".h", ".hpp"],
    "csharp": [".cs"],
    "ruby": [".rb"],
    "php": [".php"],
    "swift": [".swift"],
    "kotlin": [".kt"],
    "rust": [".rs"],
    "scala": [".scala"],
    "r": [".r", ".R"],  # R scripts conventionally use the upper-case suffix
}
def get_language_extensions(language: str) -> List[str]:
    """Return the file extensions registered for ``language``.

    The lookup is case-insensitive: the name is lower-cased before it is used
    as a key into ``LANGUAGE_EXTENSIONS``.

    Raises:
        KeyError: If the lower-cased name has no entry in the table.
    """
    normalized_name = language.lower()
    return LANGUAGE_EXTENSIONS[normalized_name]
def is_file_type(file_path: str, language: str) -> bool:
    """Return True when ``file_path`` ends with one of the language's extensions."""
    # str.endswith accepts a tuple of candidates, so one call covers them all.
    suffixes = tuple(get_language_extensions(language))
    return file_path.endswith(suffixes)
def is_likely_useful_file(file_path, lang):
    """Heuristically decide whether a path is worth including in the output.

    A path is rejected when any of the following holds:

    - any ``/``-separated component starts with a dot (hidden file or directory);
    - the lower-cased path contains the substring ``test`` anywhere
      (note that this also rejects names such as ``latest.py``);
    - it lies inside one of the excluded directories;
    - it contains one of the utility/config/docs name fragments.

    Python and Go add a few language-specific directories and fragments.

    Returns:
        True if none of the exclusion rules matched, otherwise False.
    """
    skipped_dirs = ["examples", "tests", "test", "scripts", "utils", "benchmarks", ".vscode"]
    # Fragments are matched as plain substrings of the whole path.
    skipped_fragments = [".github", ".gitignore", "LICENSE"]

    if lang == "python":
        skipped_dirs += ["__pycache__"]
        skipped_fragments += ["hubconf.py", "setup.py", "stale.py", "gen-card-", "write_model_card"]
    elif lang == "go":
        skipped_dirs += ["vendor"]
        skipped_fragments += ["go.mod", "go.sum", "Makefile"]

    has_hidden_component = any(segment.startswith('.') for segment in file_path.split('/'))
    if has_hidden_component or 'test' in file_path.lower():
        return False

    inside_skipped_dir = any(
        f"/{dir_name}/" in file_path or file_path.startswith(dir_name + "/")
        for dir_name in skipped_dirs
    )
    if inside_skipped_dir:
        return False

    return not any(fragment in file_path for fragment in skipped_fragments)
def is_test_file(file_content, lang):
    """Guess from its content whether a source file is a test file.

    Args:
        file_content: Full text of the file.
        lang: Language key; only ``"python"`` and ``"go"`` have indicators.

    Returns:
        True if a test-framework indicator for ``lang`` is found, otherwise
        False (always False for languages without indicators).
    """
    if lang == "python":
        indicators = ("import unittest", "import pytest", "from unittest", "from pytest")
    elif lang == "go":
        # Go import paths are quoted, so the unquoted text ``import testing``
        # never occurs in real Go code. Match the single-line form here ...
        indicators = ('import "testing"', "func Test")
        # ... and the grouped form, where the package sits on its own line
        # inside ``import ( ... )``.
        if any(line.strip() == '"testing"' for line in file_content.splitlines()):
            return True
    else:
        return False
    return any(indicator in file_content for indicator in indicators)
def has_sufficient_content(file_content, min_line_count=10):
    """Return True if the file has at least ``min_line_count`` substantive lines.

    A line is substantive when, after stripping surrounding whitespace, it is
    non-empty and does not begin with ``#`` or ``//``.
    """
    substantive_count = 0
    for raw_line in file_content.split('\n'):
        text = raw_line.strip()
        if not text:
            continue  # blank line
        if text.startswith(('#', '//')):
            continue  # line comment
        substantive_count += 1
    return substantive_count >= min_line_count
def _is_bare_string_statement(stmt):
    """Return True for an expression statement that is only a string literal."""
    return (
        isinstance(stmt, ast.Expr)
        and isinstance(stmt.value, ast.Constant)
        and isinstance(stmt.value.value, str)
    )


def remove_comments_and_docstrings(source):
    """Return ``source`` with comments and docstrings stripped.

    The code is parsed and regenerated through ``ast``. Comments are not part
    of the tree, so the round trip drops them. Docstrings, and any other
    statement consisting only of a string literal, are removed from every
    statement list (module, class, function, loop, branch, handler, ...).

    Args:
        source: Python source code.

    Returns:
        The regenerated source as produced by ``ast.unparse``.

    Raises:
        SyntaxError: If ``source`` is not valid Python.
    """
    tree = ast.parse(source)
    # ast.walk queues a node's children before yielding it, so replacing the
    # statement lists of the current node does not disturb the traversal.
    for node in ast.walk(tree):
        for field in ("body", "orelse", "finalbody"):
            statements = getattr(node, field, None)
            # Expression nodes such as IfExp/Lambda also have ``body``, but
            # there it is a single expression rather than a statement list.
            if not isinstance(statements, list) or not statements:
                continue
            kept = [stmt for stmt in statements if not _is_bare_string_statement(stmt)]
            if not kept and not isinstance(node, ast.Module):
                # A block that held only a docstring must stay syntactically
                # valid once the docstring is gone.
                kept = [ast.Pass()]
            setattr(node, field, kept)
    return ast.unparse(tree)
def download_repo(repo_url, output_file, lang, keep_comments=False, branch_or_tag="master"):
    """Download a GitHub repository archive and merge its relevant files into one text file.

    Args:
        repo_url: Repository URL, e.g. ``https://github.com/owner/repo``. A
            trailing slash or ``.git`` suffix is tolerated.
        output_file: Path of the text file to write (UTF-8).
        lang: Language key used to select and filter files.
        keep_comments: If False and ``lang`` is ``"python"``, comments and
            docstrings are stripped from each file.
        branch_or_tag: Branch name or tag name to download.

    The process exits with status 1 if the archive cannot be downloaded under
    either the branch or the tag ref.
    """
    base_url = repo_url.rstrip("/").removesuffix(".git")

    # GitHub serves branch archives under refs/heads and tag archives under
    # refs/tags; try the branch first, then fall back to the tag.
    response = None
    for ref_kind in ("heads", "tags"):
        download_url = f"{base_url}/archive/refs/{ref_kind}/{branch_or_tag}.zip"
        print(download_url)
        # A timeout keeps an unresponsive server from hanging the script forever.
        response = requests.get(download_url, timeout=60)
        if response.status_code == 200:
            break
    else:
        print(f"Failed to download the repository. Status code: {response.status_code}")
        sys.exit(1)

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        with open(output_file, "w", encoding="utf-8") as outfile:
            for file_path in zip_file.namelist():
                # Skip directories and files of other languages.
                if file_path.endswith("/") or not is_file_type(file_path, lang):
                    continue

                # Every entry is prefixed with the archive root "<repo>-<ref>/".
                # Filter on the path inside the repository so the repository's
                # own name (e.g. one containing "test") cannot exclude every file.
                _root, _sep, relative_path = file_path.partition("/")
                if not is_likely_useful_file(relative_path or file_path, lang):
                    continue

                try:
                    file_content = zip_file.read(file_path).decode("utf-8")
                except UnicodeDecodeError:
                    # One oddly encoded file should not abort the whole run.
                    print(f"Skipping non-UTF-8 file: {file_path}", file=sys.stderr)
                    continue

                # Skip test files based on content and files with too little substance.
                if is_test_file(file_content, lang) or not has_sufficient_content(file_content):
                    continue

                if lang == "python" and not keep_comments:
                    try:
                        file_content = remove_comments_and_docstrings(file_content)
                    except SyntaxError:
                        # Skip files that cannot be parsed.
                        continue

                outfile.write(f"// File: {file_path}\n" if lang == "go" else f"# File: {file_path}\n")
                outfile.write(file_content)
                outfile.write("\n\n")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, derive the output file
    # name from the repository name and language, then run the download.
    parser = argparse.ArgumentParser(description='Download and process files from a GitHub repository.')
    parser.add_argument('repo_url', type=str, help='The URL of the GitHub repository')
    parser.add_argument('--lang', type=str, choices=list(LANGUAGE_EXTENSIONS.keys()), default='python', help='The programming language of the repository')
    parser.add_argument('--keep-comments', action='store_true', help='Keep comments and docstrings in the source code (only applicable for Python)')
    parser.add_argument('--branch_or_tag', type=str, help='The branch or tag of the repository to download', default="master")
    args = parser.parse_args()

    # Normalize the URL first: with a trailing "/" the last path segment is
    # empty, which would name the output "_<lang>.txt", and a ".git" suffix
    # would leak into both the file name and the archive URL.
    repo_url = args.repo_url.rstrip('/').removesuffix('.git')
    output_file = f"{repo_url.split('/')[-1]}_{args.lang}.txt"

    download_repo(repo_url, output_file, args.lang, args.keep_comments, args.branch_or_tag)
    print(f"Combined {args.lang.capitalize()} source code saved to {output_file}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment