Last active
April 14, 2022 09:00
-
-
Save spezold/3b00ea130aa4db0540eb9a82719ef8d5 to your computer and use it in GitHub Desktop.
Find all modules that are imported by the given project, list the code files (*.py, *.ipynb) that use them, and try to distinguish between STL and non-STL modules.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
CAUTION: Make sure that
1. this file is placed in the root directory of the project of interest
   (or otherwise, adjust `BASE_DIR` accordingly);
2. the file is run in the same Python environment (conda environment, poetry environment, ...)
   as the project of interest (so activate the corresponding environment first, if necessary).
""" | |
from collections import defaultdict | |
from importlib.util import find_spec | |
import json | |
from pathlib import Path | |
import re | |
import subprocess | |
import sys | |
from typing import Dict, List, Optional | |
# Regex for `import <module> ...` lines (leading whitespace allowed): group 1 captures the first run of word
# characters after `import`, i.e. the top-level name of the first imported module ("os" for `import os.path`)
QUERY_IMPORT = r"^\s*import\s+(\w+)" # import ...
# Regex for `from <module> ... import` lines (leading whitespace allowed): group 1 captures the first run of word
# characters after `from`; relative imports (`from . import x`, `from .a import b`) do not match, because a
# word character is required right after the whitespace that follows `from`
QUERY_FROM = r"^\s*from\s+(\w+).*import" # from ... import ...
def all_py_files_for(base_dir: Path) -> List[Path]:
    """Return the sorted paths of all ``*.py`` files found recursively below ``base_dir``.

    The file containing this function is left out, so that the script does not report its own imports.
    """
    this_script = Path(__file__).resolve()
    candidates = base_dir.glob("**/*.py")
    return sorted(path for path in candidates if path.resolve() != this_script)
def all_ipynb_files_for(base_dir: Path) -> List[Path]:
    """Return the sorted paths of all ``*.ipynb`` files found recursively below ``base_dir``."""
    notebooks = list(base_dir.glob("**/*.ipynb"))
    notebooks.sort()
    return notebooks
def all_code_lines_for_py_file_at(p: Path) -> List[str]:
    """Read the file at ``p`` as UTF-8 text and return its content split at newline characters."""
    content = p.read_text(encoding="utf-8")
    return content.split("\n")
def all_code_lines_for_ipynb_file_at(p: Path) -> List[str]:
    """Return the source lines of all code cells of the Jupyter notebook at ``p``.

    The notebook is read as UTF-8 encoded JSON. Cells whose ``cell_type`` is not ``"code"`` are skipped.
    Lines are returned as stored in the notebook, i.e. they may still carry their trailing newline.

    Raises:
        KeyError: if the notebook has no top-level ``"cells"`` entry, or a cell lacks ``"cell_type"``
            or (for code cells) ``"source"``.
    """
    notebook = json.loads(p.read_text(encoding="utf-8"))
    code_lines = []
    for cell in notebook["cells"]:
        if cell["cell_type"] != "code":
            continue
        source = cell["source"]
        # The notebook format permits a cell source to be stored either as a list of lines or as one
        # single string; iterating over the latter would yield single characters instead of lines
        if isinstance(source, str):
            source = source.splitlines(keepends=True)
        code_lines.extend(source)
    return code_lines
# One further clause of an `import a, b.c as d` statement: group 1 is the clause's top-level module name.
# The clause must consist of nothing but a (dotted) name with an optional alias, which keeps prose that
# merely starts with the word "import" from contributing additional bogus module names.
_ADDITIONAL_IMPORT_CLAUSE = re.compile(r"\s*(\w+)(?:\.\w+)*(?:\s+as\s+\w+)?\s*$")


def all_modules_in(code_lines: List[str]) -> List[str]:
    """Return the top-level names of all modules imported in ``code_lines``.

    Lines are matched against ``QUERY_IMPORT`` and ``QUERY_FROM`` (in this order); for `import` statements
    that list several modules separated by commas (`import os, sys as system`), every module is reported,
    not only the first one. The result keeps the order of occurrence and may contain duplicates.

    CAUTION: Matching is purely textual, so import-like lines inside of strings are reported as well.
    """
    modules = []
    for line in code_lines:
        import_match = re.search(QUERY_IMPORT, line)
        if import_match:
            modules.append(import_match.group(1))
            # Look at the rest of the statement only: cut off a trailing comment or a follow-up statement
            statement = re.split(r"[#;]", line[import_match.end(1):], maxsplit=1)[0]
            # The first comma-separated clause belongs to the module that has already been captured
            for clause in statement.split(",")[1:]:
                clause_match = _ADDITIONAL_IMPORT_CLAUSE.match(clause)
                if clause_match:
                    modules.append(clause_match.group(1))
        from_match = re.search(QUERY_FROM, line)
        if from_match:
            modules.append(from_match.group(1))
    return modules
def module_by_file_in(base_dir: Path) -> Dict[str, List[Path]]:
    """Map each module imported below ``base_dir`` to the files that import it.

    Both ``*.py`` files and the code cells of ``*.ipynb`` files are considered.

    Returns:
        A regular dict with sorted keys (module names); each value is the sorted, duplicate-free list of
        paths of the importing files.
    """
    # Pairs of (file finder, line reader), one pair per supported file type
    file_types = [
        (all_py_files_for, all_code_lines_for_py_file_at),
        (all_ipynb_files_for, all_code_lines_for_ipynb_file_at),
    ]
    importing_files = defaultdict(set)  # Sets take care of files that import the same module repeatedly
    for find_files, read_lines in file_types:
        for source_path in find_files(base_dir):
            for module in all_modules_in(read_lines(source_path)):
                importing_files[module].add(source_path)
    return {module: sorted(importing_files[module]) for module in sorted(importing_files)}
def is_part_of_stl(module_name: str, base_dir: Optional[Path]) -> bool:
    """Tell whether the top-level module ``module_name`` is part of the standard library (STL).

    For Python >= 3.10, the answer is looked up in ``sys.stdlib_module_names``. CAUTION: For older versions,
    a heuristic based on the location of the module is used instead, so the answer is a guess only.

    Args:
        module_name: top-level name of the module of interest.
        base_dir: root directory of the analyzed code base; modules located below it are never considered
            part of the STL. With None, this check is skipped. Only used by the heuristic.

    Returns:
        True if the module is (or, for the heuristic, appears to be) part of the STL, False otherwise.
    """
    stdlib_module_names = getattr(sys, "stdlib_module_names", None)  # This is present in Python >= 3.10 only
    if stdlib_module_names is not None:
        return module_name in stdlib_module_names

    if module_name in sys.builtin_module_names:  # For these we can still be sure, everything else is guessing
        return True
    try:
        module_spec = find_spec(module_name)
    except (ImportError, ValueError):  # E.g. ValueError for an already imported module whose `__spec__` is None
        return False
    if not module_spec:  # Should only happen for modules that are imported but not installed (so cannot be STL)
        return False
    origin = module_spec.origin
    if not origin:  # Seems like modules from current code base can have origin None
        return False
    # Special, non-path origins; "frozen" is assumed to denote modules bundled with the interpreter itself
    if origin in ["built-in", "builtin", "frozen"]:
        return True
    origin_path = Path(origin).resolve()  # Anyway, from now on, all `origin`s should be file paths
    if not origin_path.is_file():  # Unknown kind of origin: the location cannot be judged, so do not claim STL
        return False
    # Exclude modules from current code base; check the parents of the *file*, so that single-file modules
    # that lie directly in `base_dir` are covered as well
    if base_dir is not None and base_dir.resolve() in origin_path.parents:
        return False
    origin_dir_parts = origin_path.parent.parts
    return all(p not in origin_dir_parts for p in ["site-packages", "dist-packages"])  # Exclude installed modules
if __name__ == "__main__":
    BASE_DIR = Path(__file__).parent  # TODO: This could be a command line argument
    print(f"Using {BASE_DIR} as base directory.")

    # Report the git state of the analyzed project (best effort). The command is given as an argument list:
    # without a shell, a single command string would be taken as the name of the executable on POSIX systems,
    # so that git would never be found there.
    git_command = ["git", "describe", "--all"]
    try:
        git_output = subprocess.check_output(git_command, cwd=BASE_DIR, stderr=subprocess.DEVNULL)
    except (FileNotFoundError, subprocess.CalledProcessError):
        pass  # Either git is not installed, or not in a git repo
    else:
        git_command_text = " ".join(git_command)
        print(f"Git state ('{git_command_text}'): {git_output.strip().decode()}")
    print()

    # List the importing files for each module
    m_by_f = module_by_file_in(BASE_DIR)
    for key, val in m_by_f.items():
        print(f"Module '{key}' used in:")
        print("\n".join(f"- {p}" for p in val))

    # Classify each module once, then report both groups
    stl_modules, non_stl_modules = [], []
    for m in m_by_f:
        (stl_modules if is_part_of_stl(m, base_dir=BASE_DIR) else non_stl_modules).append(m)
    print("\nPart of STL:")
    print("\n".join(stl_modules))
    print("\nNot part of STL:")
    print("\n".join(non_stl_modules))

    # Dynamic imports cannot be found by scanning for import statements, so point the user to them
    if "importlib" in m_by_f:
        print("\nCAUTION: The following source files use 'importlib' and thus should be manually checked for "
              "additional imports:")
        print("\n".join(f"- {p}" for p in m_by_f["importlib"]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Known issues
- Import statements are found with regular expressions; the code should use the `ast` module (i.e. Python's own tooling) instead. The answer to this stackoverflow question can serve as the basis.
- Distinguishing STL from non-STL modules relies on `sys.stdlib_module_names`, which is available in Python >= 3.10 only. For versions prior to 3.10, I implemented a heuristic that seems to work fully reliably in my projects, but I may have overlooked some special cases. Have a look and adapt the `is_part_of_stl()` function, if something looks wrong in this respect.