Last active
February 7, 2024 03:21
-
-
Save henryiii/432868baf4a69432b6d1c0220592af3d to your computer and use it in GitHub Desktop.
Download all pyproject.tomls
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib | |
import sqlite3 | |
import tomllib | |
from collections import Counter | |
def main(): | |
counter = Counter() | |
with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as con: | |
cursor = con.cursor() | |
for row in cursor.execute("SELECT contents FROM pyproject"): | |
contents, = row | |
try: | |
toml = tomllib.loads(contents) | |
backend = toml.get("build-system", {}).get("build-backend", "unknown") | |
if isinstance(backend, str): | |
counter[backend] += 1 | |
else: | |
counter["busted"] += 1 | |
except tomllib.TOMLDecodeError: | |
counter["broken"] += 1 | |
for i, (k, v) in enumerate(counter.most_common()): | |
print(f"{i:3} {k}: {v}") | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import pickle | |
from collections import Counter | |
from collections.abc import Generator | |
from pathlib import Path | |
from typing import Any | |
def dig(value: Any, key: str, *keys: str) -> Any:
    """Walk nested dicts: ``dig(d, "a", "b")`` returns ``d["a"]["b"]``.

    Missing keys yield ``{}`` (falsy) rather than raising. Fix: the original
    called ``.get`` on whatever the previous level returned, so a non-dict
    intermediate value (e.g. ``tool = "x"`` in a malformed file) raised
    AttributeError; such values now also yield ``{}``.
    """
    if not isinstance(value, dict):
        return {}
    res = value.get(key, {})
    return dig(res, *keys) if keys else res
def all_keys(
    d: dict[str, Any], level: int, *prefixes: str
) -> Generator[str, None, None]:
    """Yield dotted key paths of *d*, descending into sub-dicts up to *level*
    extra levels deep; leaves (or depth-limited dicts) become "pre.fix.key".
    """
    for name, entry in d.items():
        descend = level > 0 and isinstance(entry, dict)
        if descend:
            yield from all_keys(entry, level - 1, *prefixes, name)
        else:
            yield ".".join((*prefixes, name))
def get_tomls_cached(db: str) -> Generator[tuple[str, str, dict[str, Any]], None, None]:
    """Yield (project_name, project_version, parsed_toml) triples from the
    ``<db>.pkl`` cache file written by make_cache.
    """
    cache = Path(f"{db}.pkl")
    yield from pickle.loads(cache.read_bytes())
def main(tool: str, get_contents: bool, level: int = 0) -> None:
    """Print a frequency table for the *tool* section of every cached
    pyproject.toml: either the repr of the whole section (``get_contents``)
    or its key paths unpacked *level* dict levels deep.
    """
    if not tool:
        if get_contents:
            raise AssertionError("Can't get contents with no section")
        print("*:")
    elif get_contents:
        print(f"{tool} contents:")
    else:
        print(tool + ".*" * (level + 1) + ":")
    if get_contents and level > 0:
        raise AssertionError("Can't use level with contents")
    tally = Counter()
    keys = tool.split(".") if tool else []
    for _, _, doc in get_tomls_cached("pyproject_contents.db"):
        section = dig(doc, *keys) if tool else doc
        if not section:
            continue
        if get_contents:
            tally[repr(section)] += 1
        else:
            tally.update(all_keys(section, level=level))
    for key, count in tally.most_common():
        print(f"{key}: {count}")
def blame(tool: str, string: str) -> None:
    """Print the projects whose *tool* value reprs exactly as *string*.

    With an empty *string*, print every project that has any truthy value
    under *tool*, together with that value.
    """
    if string:
        print(tool, "=", string)
    else:
        print(tool, "= ...")
    keys = tool.split(".") if tool else []
    for project, release, doc in get_tomls_cached("pyproject_contents.db"):
        found = dig(doc, *keys) if tool else doc
        if not string and found:
            print(project, release, "=", repr(found))
        elif repr(found) == string:
            print(project, release)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Fix: help text typo ("Tool to processes").
    parser.add_argument("tool", help="Tool to process")
    parser.add_argument("-c", "--contents", action="store_true")
    parser.add_argument(
        "-l", "--level", type=int, default=0, help="Unpack nested levels"
    )
    parser.add_argument(
        "-b",
        "--blame",
        help="print matching project names, empty string to print any value (careful)",
    )
    args = parser.parse_args()
    if args.blame is not None:
        # --blame is exclusive with --level/--contents. Use parser.error (usage
        # message + exit 2) instead of assert, which vanishes under `python -O`.
        if args.level != 0:
            parser.error("--blame cannot be combined with --level")
        if args.contents:
            parser.error("--blame cannot be combined with --contents")
        blame(args.tool, args.blame)
    else:
        main(args.tool, args.contents, args.level)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import contextlib | |
import sqlite3 | |
import tomllib | |
from collections import Counter | |
def main(): | |
counter = Counter() | |
with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as con: | |
cursor = con.cursor() | |
for row in cursor.execute("SELECT contents FROM pyproject"): | |
contents, = row | |
with contextlib.suppress(tomllib.TOMLDecodeError): | |
toml = tomllib.loads(contents) | |
tools = toml.get("tool", {}).keys() | |
counter += Counter(f"tool.{k}" for k in tools) | |
for i, (k, v) in enumerate(counter.most_common()): | |
print(f"{i:3} {k}: {v}") | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import contextlib | |
import pickle | |
import sqlite3 | |
from collections.abc import Generator | |
from pathlib import Path | |
from typing import Any | |
import tomllib | |
def get_tomls(db: str) -> Generator[tuple[str, str, dict[str, Any]], None, None]: | |
with contextlib.closing(sqlite3.connect(db)) as con: | |
cursor = con.cursor() | |
for row in cursor.execute( | |
"SELECT project_name, project_version, contents FROM pyproject" | |
): | |
project_name, project_version, contents = row | |
with contextlib.suppress(tomllib.TOMLDecodeError): | |
yield project_name, project_version, tomllib.loads(contents) | |
def make_cache(db: str) -> None:
    """Parse every row of *db* once via get_tomls and pickle the resulting
    list to ``<db>.pkl`` for the analysis scripts to reload quickly.

    Fix: the original called ``pkl.open("wb", pickle.HIGHEST_PROTOCOL)``,
    where the second positional argument of Path.open is *buffering* (a
    buffer size), not a pickle protocol — so it silently set a tiny write
    buffer and dumped with the default protocol. The protocol belongs to
    ``pickle.dump``.
    """
    pkl = Path(f"{db}.pkl")
    with pkl.open("wb") as f:
        pickle.dump(list(get_tomls(db)), f, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    make_cache("pyproject_contents.db")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MIT License | |
Copyright (c) 2023 Françoise CONIL | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# dependencies = ["aiohttp", "packaging"] | |
# requires-python = ">=3.11" | |
# /// | |
""" | |
Downloads all pyproject.toml files and puts them in a database. Doesn't talk to | |
GitHub if the package name + version already is in the database. | |
You need an input CSV to work on. To prepare one, you can use | |
pyproject-latest-to-csv.py. | |
""" | |
import asyncio | |
import contextlib | |
import csv | |
import itertools | |
import logging | |
import sqlite3 | |
import sys | |
import time | |
from collections.abc import Iterator | |
import aiohttp | |
from packaging.version import Version | |
LOG = logging.getLogger(__name__) | |
PYPROJECT_CREATE = """CREATE TABLE IF NOT EXISTS | |
pyproject(project_name TEXT PRIMARY KEY, project_version TEXT, contents TEXT) | |
""" | |
INSERT_CONTENTS = """INSERT INTO pyproject | |
VALUES (:project_name, :project_version, :contents) | |
""" | |
GET_CONTENTS = """SELECT project_version FROM pyproject WHERE project_name=? | |
""" | |
DELETE_CONTENTS = """DELETE FROM pyproject WHERE project_name=? | |
""" | |
csv.field_size_limit(sys.maxsize) | |
async def get_data(
    session: aiohttp.ClientSession, path: str, repo: int, name: str
) -> str | None:
    """Fetch one pyproject.toml from the pypi-data GitHub mirror.

    Returns the file text, or None when the path layout is unexpected, the
    HTTP status is not 200, the body is not decodable text, or the client
    raises. Raises RuntimeError on HTTP 429 so the whole run aborts instead
    of hammering a rate-limited server.
    """
    # Mirrored files live at a fixed depth; anything else is not a
    # top-level pyproject.toml and is skipped.
    if path.count("/") != 4 or not path.endswith("/pyproject.toml"):
        LOG.warning("Project %s has non-top-level path %s", name, path)
        return None
    url = f"https://raw.githubusercontent.com/pypi-data/pypi-mirror-{repo}/code/{path}"
    try:
        async with session.get(url) as response:
            status = response.status
            if status == 429:
                LOG.error("Rate limited when accessing %s", name)
                raise RuntimeError("Rate limited")
            if status != 200:
                LOG.error("pycodeorg.get_data failed to retrieve %s", name)
                return None
            try:
                return await response.text()
            except UnicodeDecodeError:
                # Some mirrored files are not valid text; skip them.
                LOG.exception("Unicode decode error on %s", name)
                return None
    except (
        aiohttp.http_exceptions.BadHttpMessage,
        aiohttp.client_exceptions.ClientResponseError,
    ):
        LOG.exception("Failed reading %s", name)
        return None
async def worker(
    iterator: Iterator[dict[str, str]], session: aiohttp.ClientSession, cursor: sqlite3.Cursor, thread: int
) -> None:
    """Drain CSV rows from the shared *iterator*: fetch each project's
    pyproject.toml and upsert it into the pyproject table.

    Several worker tasks share one iterator; each next() hands a row to
    exactly one task (the asyncio event loop runs tasks one at a time, and
    next() is synchronous, so no extra locking is used here).

    :param iterator: rows from csv.DictReader — dicts with at least
        project_name, project_version, repository, and path keys.
    :param thread: worker index, used only for progress log lines.
    """
    # next() raises StopIteration once the CSV is exhausted; suppressing it
    # is how this worker terminates.
    with contextlib.suppress(StopIteration):
        for i in itertools.count(0):
            if i and i % 200 == 0:
                LOG.info("PROGRESS %d: %d", thread, i)
            line = next(iterator)
            # Skip rows we already hold at this version or newer. The
            # connection context manager wraps the read in a transaction.
            with cursor.connection:
                result = cursor.execute(GET_CONTENTS, (line["project_name"],))
                value = result.fetchone()
                if value and Version(line["project_version"]) <= Version(value[0]):
                    continue
            data = await get_data(
                session, line["path"], line["repository"], line["project_name"]
            )
            if not data:
                continue
            # Delete-then-insert replaces any older row; both statements
            # commit together via the connection context manager.
            with cursor.connection:
                cursor.execute(DELETE_CONTENTS, (line["project_name"],))
                cursor.execute(
                    INSERT_CONTENTS,
                    {
                        "project_name": line["project_name"],
                        "project_version": line["project_version"],
                        "contents": data,
                    },
                )
async def main() -> None:
    """Open/create the pyproject table, then fan the latest-versions CSV out
    to eight concurrent download workers sharing one row iterator.
    """
    with contextlib.closing(sqlite3.connect("pyproject_contents.db")) as cnx_backend:
        cur_backend = cnx_backend.cursor()
        cur_backend.execute(PYPROJECT_CREATE)
        # Count of all versions is informational only.
        with open("extract-pyproject-all-versions.csv", newline="") as f:
            total = len(list(csv.DictReader(f)))
        print(f"Processing {total} projects")
        with open("extract-pyproject-latest.csv", newline="") as f:
            rows = list(csv.DictReader(f))
        # Reverse so the workers start from the end of the CSV.
        shared_rows = iter(reversed(rows))
        async with aiohttp.ClientSession() as session, asyncio.TaskGroup() as tg:
            for worker_id in range(8):
                tg.create_task(worker(shared_rows, session, cur_backend, worker_id))
if __name__ == "__main__":
    start_time = time.time()
    # Log to a file so per-worker progress and errors survive the run.
    logging.basicConfig(filename="pyproject_contents.log", level=logging.INFO)
    asyncio.run(main())
    end_time = time.time()
    # NOTE(review): ":0.3" formats to 3 *significant figures*, not 3 decimal
    # places — ":.3f" may have been intended. Left as-is (affects output only).
    duration_msg = f"Getting files took : {end_time - start_time:0.3} seconds."
    LOG.info(duration_msg)
    print(duration_msg)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# dependencies = ["duckdb"] | |
# /// | |
""" | |
Originally from https://framapiaf.org/@fcodvpt/111540079686191842 | |
https://gitlab.liris.cnrs.fr/fconil-small-programs/packaging/get-pypi-packages-backends | |
https://sethmlarson.dev/security-developer-in-residence-weekly-report-18 | |
https://gist.github.com/sethmlarson/852341a9b7899eda7d22d8c362c0a095 | |
curl -L --remote-name-all $(curl -L "https://github.com/pypi-data/data/raw/main/links/dataset.txt") | |
MIT licensed. | |
""" | |
import duckdb | |
# Aggregate every pyproject.toml upload per project (2018 onward) from the
# pypi-data parquet dataset into one CSV row per project.
# NOTE(review): the '.' in 'pyproject.toml$' is an unescaped regex wildcard,
# so e.g. 'pyprojectXtoml' would also match — likely harmless, but confirm.
ALL_VERSIONS_QUERY = """SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(project_version) AS max_version,
LIST(DISTINCT project_version) AS all_versions,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on,
LIST(DISTINCT repository) AS all_repository,
LIST(DISTINCT path) AS all_path
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name;
"""
res = duckdb.sql(ALL_VERSIONS_QUERY)
res.to_csv("extract-pyproject-all-versions.csv", header=True)

# For each project, keep only the row(s) matching its most recent upload
# (the CTE computes each project's max_uploaded_on; the join filters to it).
LATEST_QUERY = """WITH lpv AS (SELECT project_name, COUNT(project_name) AS nb_uploads,
MAX(uploaded_on) AS max_uploaded_on,
LIST(DISTINCT uploaded_on) AS all_uploaded_on
FROM '*.parquet'
WHERE (date_part('year', uploaded_on) >= '2018') AND regexp_matches(path, 'pyproject.toml$') AND skip_reason == ''
GROUP BY project_name)
SELECT ip.repository, ip.project_name, ip.project_version, lpv.nb_uploads,
ip.uploaded_on, date_part('year', ip.uploaded_on) AS year, ip.path
FROM '*.parquet' as ip
JOIN lpv ON ip.project_name=lpv.project_name AND ip.uploaded_on=lpv.max_uploaded_on
WHERE regexp_matches(path, 'pyproject.toml$') AND skip_reason == '';
"""
# res = duckdb.sql(LATEST_QUERY).show()
res = duckdb.sql(LATEST_QUERY)
res.to_csv("extract-pyproject-latest.csv", header=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment