Last active
February 7, 2024 03:21
-
-
Save henryiii/432868baf4a69432b6d1c0220592af3d to your computer and use it in GitHub Desktop.
Download all pyproject.tomls
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib | |
import sqlite3 | |
import tomllib | |
from collections import Counter | |
def main(): | |
counter = Counter() | |
with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as con: | |
cursor = con.cursor() | |
for row in cursor.execute("SELECT contents FROM pyproject"): | |
contents, = row | |
try: | |
toml = tomllib.loads(contents) | |
backend = toml.get("build-system", {}).get("build-backend", "unknown") | |
if isinstance(backend, str): | |
counter[backend] += 1 | |
else: | |
counter["busted"] += 1 | |
except tomllib.TOMLDecodeError: | |
counter["broken"] += 1 | |
for i, (k, v) in enumerate(counter.most_common()): | |
print(f"{i:3} {k}: {v}") | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import pickle | |
from collections import Counter | |
from collections.abc import Generator | |
from pathlib import Path | |
from typing import Any | |
def dig(value: Any, key: str, *keys: str) -> Any:
    """Walk nested dicts: ``dig(d, "a", "b")`` returns ``d["a"]["b"]``.

    Missing keys yield ``{}`` (falsy) rather than raising. Fix: the original
    called ``.get`` on whatever the previous level returned, so a non-dict
    intermediate value (e.g. ``tool = "x"`` in a malformed file) raised
    AttributeError; such values now also yield ``{}``.
    """
    if not isinstance(value, dict):
        return {}
    res = value.get(key, {})
    return dig(res, *keys) if keys else res
def all_keys(
    d: dict[str, Any], level: int, *prefixes: str
) -> Generator[str, None, None]:
    """Yield dotted key paths of *d*, descending into sub-dicts up to *level*
    extra levels deep; leaves (or depth-limited dicts) become "pre.fix.key".
    """
    for name, entry in d.items():
        descend = level > 0 and isinstance(entry, dict)
        if descend:
            yield from all_keys(entry, level - 1, *prefixes, name)
        else:
            yield ".".join((*prefixes, name))
def get_tomls_cached(db: str) -> Generator[tuple[str, str, dict[str, Any]], None, None]:
    """Yield (project_name, project_version, parsed_toml) triples from the
    ``<db>.pkl`` cache file written by make_cache.
    """
    cache = Path(f"{db}.pkl")
    yield from pickle.loads(cache.read_bytes())
def main(tool: str, get_contents: bool, level: int = 0) -> None:
    """Print a frequency table for the *tool* section of every cached
    pyproject.toml: either the repr of the whole section (``get_contents``)
    or its key paths unpacked *level* dict levels deep.
    """
    if not tool:
        if get_contents:
            raise AssertionError("Can't get contents with no section")
        print("*:")
    elif get_contents:
        print(f"{tool} contents:")
    else:
        print(tool + ".*" * (level + 1) + ":")
    if get_contents and level > 0:
        raise AssertionError("Can't use level with contents")
    tally = Counter()
    keys = tool.split(".") if tool else []
    for _, _, doc in get_tomls_cached("pyproject_contents.db"):
        section = dig(doc, *keys) if tool else doc
        if not section:
            continue
        if get_contents:
            tally[repr(section)] += 1
        else:
            tally.update(all_keys(section, level=level))
    for key, count in tally.most_common():
        print(f"{key}: {count}")
def blame(tool: str, string: str) -> None:
    """Print the projects whose *tool* value reprs exactly as *string*.

    With an empty *string*, print every project that has any truthy value
    under *tool*, together with that value.
    """
    if string:
        print(tool, "=", string)
    else:
        print(tool, "= ...")
    keys = tool.split(".") if tool else []
    for project, release, doc in get_tomls_cached("pyproject_contents.db"):
        found = dig(doc, *keys) if tool else doc
        if not string and found:
            print(project, release, "=", repr(found))
        elif repr(found) == string:
            print(project, release)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Fix: help text typo ("Tool to processes").
    parser.add_argument("tool", help="Tool to process")
    parser.add_argument("-c", "--contents", action="store_true")
    parser.add_argument(
        "-l", "--level", type=int, default=0, help="Unpack nested levels"
    )
    parser.add_argument(
        "-b",
        "--blame",
        help="print matching project names, empty string to print any value (careful)",
    )
    args = parser.parse_args()
    if args.blame is not None:
        # --blame is exclusive with --level/--contents. Use parser.error (usage
        # message + exit 2) instead of assert, which vanishes under `python -O`.
        if args.level != 0:
            parser.error("--blame cannot be combined with --level")
        if args.contents:
            parser.error("--blame cannot be combined with --contents")
        blame(args.tool, args.blame)
    else:
        main(args.tool, args.contents, args.level)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import contextlib | |
import sqlite3 | |
import tomllib | |
from collections import Counter | |
def main(): | |
counter = Counter() | |
with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as con: | |
cursor = con.cursor() | |
for row in cursor.execute("SELECT contents FROM pyproject"): | |
contents, = row | |
with contextlib.suppress(tomllib.TOMLDecodeError): | |
toml = tomllib.loads(contents) | |
tools = toml.get("tool", {}).keys() | |
counter += Counter(f"tool.{k}" for k in tools) | |
for i, (k, v) in enumerate(counter.most_common()): | |
print(f"{i:3} {k}: {v}") | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import contextlib | |
import pickle | |
import sqlite3 | |
from collections.abc import Generator | |
from pathlib import Path | |
from typing import Any | |
import tomllib | |
def get_tomls(db: str) -> Generator[tuple[str, str, dict[str, Any]], None, None]: | |
with contextlib.closing(sqlite3.connect(db)) as con: | |
cursor = con.cursor() | |
for row in cursor.execute( | |
"SELECT project_name, project_version, contents FROM pyproject" | |
): | |
project_name, project_version, contents = row | |
with contextlib.suppress(tomllib.TOMLDecodeError): | |
yield project_name, project_version, tomllib.loads(contents) | |
def make_cache(db: str) -> None:
    """Parse every row of *db* once via get_tomls and pickle the resulting
    list to ``<db>.pkl`` for the analysis scripts to reload quickly.

    Fix: the original called ``pkl.open("wb", pickle.HIGHEST_PROTOCOL)``,
    where the second positional argument of Path.open is *buffering* (a
    buffer size), not a pickle protocol — so it silently set a tiny write
    buffer and dumped with the default protocol. The protocol belongs to
    ``pickle.dump``.
    """
    pkl = Path(f"{db}.pkl")
    with pkl.open("wb") as f:
        pickle.dump(list(get_tomls(db)), f, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    make_cache("pyproject_contents.db")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MIT License | |
Copyright (c) 2023 Françoise CONIL | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# dependencies = ["aiohttp", "packaging"] | |
# requires-python = ">=3.11" | |
# /// | |
""" | |
Downloads all pyproject.toml files and puts them in a database. Doesn't talk to | |
GitHub if the package name + version already is in the database. | |
You need an input CSV to work on. To prepare one, you can use | |
pyproject-latest-to-csv.py. | |
""" | |
import asyncio | |
import contextlib | |
import csv | |
import itertools | |
import logging | |
import sqlite3 | |
import sys | |
import time | |
from collections.abc import Iterator | |
import aiohttp | |
from packaging.version import Version | |
LOG = logging.getLogger(__name__) | |
PYPROJECT_CREATE = """CREATE TABLE IF NOT EXISTS | |
pyproject(project_name TEXT PRIMARY KEY, project_version TEXT, contents TEXT) | |
""" | |
INSERT_CONTENTS = """INSERT INTO pyproject | |
VALUES (:project_name, :project_version, :contents) | |
""" | |
GET_CONTENTS = """SELECT project_version FROM pyproject WHERE project_name=? | |
""" | |
DELETE_CONTENTS = """DELETE FROM pyproject WHERE project_name=? | |
""" | |
csv.field_size_limit(sys.maxsize) | |
async def get_data(
    session: aiohttp.ClientSession, path: str, repo: int, name: str
) -> str | None:
    """Fetch one pyproject.toml from the pypi-data GitHub mirror.

    Returns the file text, or None when the path layout is unexpected, the
    HTTP status is not 200, the body is not decodable text, or the client
    raises. Raises RuntimeError on HTTP 429 so the whole run aborts instead
    of hammering a rate-limited server.
    """
    # Mirrored files live at a fixed depth; anything else is not a
    # top-level pyproject.toml and is skipped.
    if path.count("/") != 4 or not path.endswith("/pyproject.toml"):
        LOG.warning("Project %s has non-top-level path %s", name, path)
        return None
    url = f"https://raw.githubusercontent.com/pypi-data/pypi-mirror-{repo}/code/{path}"
    try:
        async with session.get(url) as response:
            status = response.status
            if status == 429:
                LOG.error("Rate limited when accessing %s", name)
                raise RuntimeError("Rate limited")
            if status != 200:
                LOG.error("pycodeorg.get_data failed to retrieve %s", name)
                return None
            try:
                return await response.text()
            except UnicodeDecodeError:
                # Some mirrored files are not valid text; skip them.
                LOG.exception("Unicode decode error on %s", name)
                return None
    except (
        aiohttp.http_exceptions.BadHttpMessage,
        aiohttp.client_exceptions.ClientResponseError,
    ):
        LOG.exception("Failed reading %s", name)
        return None
async def worker(
    iterator: Iterator[dict[str, str]], session: aiohttp.ClientSession, cursor: sqlite3.Cursor, thread: int
) -> None:
    """Drain CSV rows from the shared *iterator*: fetch each project's
    pyproject.toml and upsert it into the pyproject table.

    Several worker tasks share one iterator; each next() hands a row to
    exactly one task (the asyncio event loop runs tasks one at a time, and
    next() is synchronous, so no extra locking is used here).

    :param iterator: rows from csv.DictReader — dicts with at least
        project_name, project_version, repository, and path keys.
    :param thread: worker index, used only for progress log lines.
    """
    # next() raises StopIteration once the CSV is exhausted; suppressing it
    # is how this worker terminates.
    with contextlib.suppress(StopIteration):
        for i in itertools.count(0):
            if i and i % 200 == 0:
                LOG.info("PROGRESS %d: %d", thread, i)
            line = next(iterator)
            # Skip rows we already hold at this version or newer. The
            # connection context manager wraps the read in a transaction.
            with cursor.connection:
                result = cursor.execute(GET_CONTENTS, (line["project_name"],))
                value = result.fetchone()
                if value and Version(line["project_version"]) <= Version(value[0]):
                    continue
            data = await get_data(
                session, line["path"], line["repository"], line["project_name"]
            )
            if not data:
                continue
            # Delete-then-insert replaces any older row; both statements
            # commit together via the connection context manager.
            with cursor.connection:
                cursor.execute(DELETE_CONTENTS, (line["project_name"],))
                cursor.execute(
                    INSERT_CONTENTS,
                    {
                        "project_name": line["project_name"],
                        "project_version": line["project_version"],
                        "contents": data,
                    },
                )
async def main() -> None:
    """Open/create the pyproject table, then fan the latest-versions CSV out
    to eight concurrent download workers sharing one row iterator.
    """
    with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as cnx_backend:
        cur_backend = cnx_backend.cursor()
        cur_backend.execute(PYPROJECT_CREATE)
        # Count of all versions is informational only.
        with open("extract-pyproject-all-versions.csv", newline="") as f:
            total = len(list(csv.DictReader(f)))
        print(f"Processing {total} projects")
        with open("extract-pyproject-latest.csv", newline="") as f:
            rows = list(csv.DictReader(f))
        # Reverse so the workers start from the end of the CSV.
        shared_rows = iter(reversed(rows))
        async with aiohttp.ClientSession() as session, asyncio.TaskGroup() as tg:
            for worker_id in range(8):
                tg.create_task(worker(shared_rows, session, cur_backend, worker_id))
if __name__ == "__main__":
    start_time = time.time()
    # Log to a file so per-worker progress and errors survive the run.
    logging.basicConfig(filename="pyproject_contents.log", level=logging.INFO)
    asyncio.run(main())
    end_time = time.time()
    # NOTE(review): ":0.3" formats to 3 *significant figures*, not 3 decimal
    # places — ":.3f" may have been intended. Left as-is (affects output only).
    duration_msg = f"Getting files took : {end_time - start_time:0.3} seconds."
    LOG.info(duration_msg)
    print(duration_msg)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# dependencies = ["duckdb"] | |
# /// | |
""" | |
Originally from https://framapiaf.org/@fcodvpt/111540079686191842 | |
https://gitlab.liris.cnrs.fr/fconil-small-programs/packaging/get-pypi-packages-backends | |
https://sethmlarson.dev/security-developer-in-residence-weekly-report-18 | |
https://gist.github.com/sethmlarson/852341a9b7899eda7d22d8c362c0a095 | |
curl -L --remote-name-all $(curl -L "https://github.com/pypi-data/data/raw/main/links/dataset.txt") | |
MIT licensed. | |
""" | |
import duckdb | |
# Aggregate every pyproject.toml upload per project (2018 onward) from the
# pypi-data parquet dataset into one CSV row per project.
# NOTE(review): the '.' in 'pyproject.toml$' is an unescaped regex wildcard,
# so e.g. 'pyprojectXtoml' would also match — likely harmless, but confirm.
ALL_VERSIONS_QUERY = """SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(project_version) AS max_version,
LIST(DISTINCT project_version) AS all_versions,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on,
LIST(DISTINCT repository) AS all_repository,
LIST(DISTINCT path) AS all_path
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name;
"""
res = duckdb.sql(ALL_VERSIONS_QUERY)
res.to_csv("extract-pyproject-all-versions.csv", header=True)

# For each project, keep only the row(s) matching its most recent upload
# (the CTE computes each project's max_uploaded_on; the join filters to it).
LATEST_QUERY = """WITH lpv AS (SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name)
SELECT ip.repository, ip.project_name, ip.project_version, lpv.nb_uploads,
ip.uploaded_on, date_part('year', ip.uploaded_on) AS year, ip.path
FROM '*.parquet' as ip
JOIN lpv ON ip.project_name=lpv.project_name AND ip.uploaded_on=lpv.max_uploaded_on
WHERE regexp_matches(path, 'pyproject.toml$') AND skip_reason == '';
"""
# res = duckdb.sql(LATEST_QUERY).show()
res = duckdb.sql(LATEST_QUERY)
res.to_csv("extract-pyproject-latest.csv", header=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment