Last active
October 8, 2023 13:14
-
-
Save glenn-jocher/f0452b55364e6a83d24e80efd96c5b94 to your computer and use it in GitHub Desktop.
Analyze Python Files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Python Files Analyzer

This script analyzes Python files in a given GitHub repo, excluding specified sub-directories.
It counts the following for each file:
- Total number of characters
- Total number of words
- Total number of lines
- Total number of functions (based on the 'def' keyword)
- Total number of classes (based on the 'class' keyword)

Totals across all analyzed files are printed to the console. Additionally, histograms of the
per-file characters, words, and lines are plotted and saved as a high-resolution PNG image
('python_files_statistics.png').

Usage:
Run the script directly. It clones the repository given by 'repo_url' in the '__main__' block into
the 'repos/' directory (an existing clone is reused), then analyzes all Python files in that
repository and its sub-directories, excluding any paths matching patterns in the
'exclude_patterns' list (e.g., "/venv" and "/runs").

Required Libraries:
- Standard library: collections, pathlib, re, subprocess
- Third-party: matplotlib, numpy, tqdm

The 'git' executable must also be available on PATH for cloning.
""" | |
import re | |
import subprocess | |
from collections import defaultdict | |
from pathlib import Path | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from tqdm import tqdm | |
def clone_github_repo(repo_url: str, target_dir: Path = Path("repos")) -> Path:
    """
    Clone a GitHub repository to a target directory.

    The clone is skipped when the destination folder already exists, so repeated runs reuse the
    data fetched by the first one.

    Args:
        repo_url (str): URL of the GitHub repository.
        target_dir (Path): Directory where the repo should be cloned.

    Returns:
        Path: Path to the cloned repository.

    Raises:
        subprocess.CalledProcessError: If 'git clone' exits with a non-zero status.
    """
    # exist_ok=True replaces the separate exists() check and its check-then-create race
    target_dir.mkdir(parents=True, exist_ok=True)

    # Extract repo name from the URL to use as a folder name.
    # Only a trailing '.git' is dropped: a blanket replace would also mangle names that merely
    # contain '.git', e.g. 'my.github.io' -> 'myhub.io'.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    repo_path = target_dir / repo_name

    if not repo_path.exists():
        print(f"Cloning {repo_url} into {repo_path}...")
        # List-form arguments (no shell) keep the URL from being interpreted by a shell
        subprocess.run(["git", "clone", repo_url, str(repo_path)], check=True)
    else:
        print(f"{repo_path} already exists. Skipping clone and using existing data.")

    return repo_path
def analyze_python_files(directory_path: Path, exclude_patterns: list = None) -> dict:
    """
    Analyze Python files in the directory: count characters, words, lines, functions, and classes.

    Files are found recursively with the '*.py' glob. A file is skipped when any exclude pattern
    occurs as a plain substring of its path.

    Args:
        directory_path (Path): Directory to analyze.
        exclude_patterns (list): List of directory patterns to exclude. Defaults to no exclusions.

    Returns:
        dict: Results with file paths as keys and (chars, words, lines, functions, classes)
            tuples as values.
    """
    # None instead of a mutable [] default, which would be shared between calls
    patterns = [] if exclude_patterns is None else exclude_patterns

    func_pattern = re.compile(r'^\s*def\s+\w+\s*\(', re.MULTILINE)
    class_pattern = re.compile(r'^\s*class\s+\w+', re.MULTILINE)

    valid_files = [f for f in directory_path.rglob('*.py') if not _path_is_excluded(f, patterns)]

    # A plain dict: a defaultdict would silently insert an empty tuple when a missing key is read
    results = {}
    for py_file in tqdm(valid_files, desc="Analyzing"):
        # errors='replace' keeps files that are not valid UTF-8 from aborting the whole run
        with py_file.open(encoding='utf-8', errors='replace') as f:
            content = f.read()
        results[py_file] = _text_stats(content, func_pattern, class_pattern)

    return results


def _path_is_excluded(path: Path, patterns: list) -> bool:
    """Return True when any pattern is a substring of the path in native or '/'-separated form."""
    # The '/'-separated form lets patterns such as "/venv" also match on Windows
    native, posix = str(path), path.as_posix()
    return any(pattern in native or pattern in posix for pattern in patterns)


def _text_stats(content: str, func_pattern, class_pattern) -> tuple:
    """Return (chars, words, lines, functions, classes) for the text of one file."""
    # Count newline terminators, plus one for a final line that lacks a trailing newline.
    # len(content.split('\n')) would report 1 line for an empty file and one too many for
    # every file that ends with a newline.
    lines = content.count('\n')
    if content and not content.endswith('\n'):
        lines += 1

    return (
        len(content),
        len(content.split()),
        lines,
        len(func_pattern.findall(content)),
        len(class_pattern.findall(content)),
    )
def plot_histogram(data: dict, save_path: str = 'python_files_statistics.png') -> None:
    """
    Plot histograms for characters, words, and lines on a single figure with stats.

    Each subplot title shows the min, mean, and max of its metric. The figure is saved to
    'save_path' at 300 dpi and then displayed.

    Args:
        data (dict): Dictionary with filenames as keys and tuples as values whose first three
            items are (chars, words, lines). Any further items are ignored.
        save_path (str): Where the PNG image is written.
    """
    # Nothing to plot: np.min/np.max raise ValueError on an empty sequence
    if not data:
        print("No data to plot. Skipping histogram.")
        return

    metrics = ['chars', 'words', 'lines']
    titles = ['Characters', 'Words', 'Lines']
    stats = list(data.values())

    # Create a figure and a 1x3 grid of subplots
    _, axes = plt.subplots(1, 3, figsize=(18, 6))

    # The position of a metric in 'metrics' is also its index in each stats tuple
    for idx, (metric, title) in enumerate(zip(metrics, titles)):
        values = [v[idx] for v in stats]
        min_val, mean_val, max_val = np.min(values), np.mean(values), np.max(values)

        ax = axes[idx]
        ax.hist(values, bins=30, edgecolor='black')
        ax.set_yscale('log')
        ax.set_title(f"{title}\nMin: {min_val}, Mean: {round(mean_val, 2)}, Max: {max_val}")
        ax.set_xlabel(metric)
        ax.set_ylabel('Number of files (log scale)')

    plt.tight_layout()
    # Save before show(): the figure may no longer be available once the window is closed
    plt.savefig(save_path, dpi=300)
    plt.show()
if __name__ == '__main__':
    repo_url = 'https://github.com/ultralytics/ultralytics'

    # Clone the GitHub repo into 'repos/' directory (an existing clone is reused)
    repo_path = clone_github_repo(repo_url)

    print(f'Analyzing Python files in {repo_path}')
    results = analyze_python_files(repo_path, ["/venv", "/runs"])

    # Each value is a (chars, words, lines, functions, classes) tuple; sum every column
    stats = list(results.values())
    total_files = len(stats)
    total_chars = sum(v[0] for v in stats)
    total_words = sum(v[1] for v in stats)
    total_lines = sum(v[2] for v in stats)
    total_functions = sum(v[3] for v in stats)
    total_classes = sum(v[4] for v in stats)

    # The ", " after Words keeps every field of the summary separated the same way
    print(f"Files: {total_files}, Lines: {total_lines}, Chars: {total_chars}, Words: {total_words}, "
          f"Functions: {total_functions}, Classes: {total_classes}")

    # Min/mean/max of an empty set are undefined, so only plot when at least one file was found
    if results:
        plot_histogram(results)
    else:
        print('No Python files found. Skipping histogram.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment