Created
August 20, 2021 04:38
-
-
Save e2thenegpii/c4b8d89d6866f419504ecafcdd20eb4f to your computer and use it in GitHub Desktop.
Combine `packaging` and `aiohttp` to discover the distribution files on a package index that satisfy a set of requirements.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
from contextlib import closing | |
from typing import Iterator, Dict, Set, AsyncIterator, Iterable, Mapping, Container | |
from urllib.parse import urlparse, urlunparse | |
from collections import defaultdict | |
from pprint import pprint | |
import aiohttp | |
from packaging.requirements import Requirement | |
from packaging.utils import parse_wheel_filename, parse_sdist_filename | |
from lxml import etree | |
from yarl import URL | |
import pdb | |
async def get_anchor_tags(response: aiohttp.ClientResponse) -> AsyncIterator[etree.Element]:
    """Yield every ``<a>`` element found while streaming *response* through lxml.

    The body is read chunk by chunk from ``response.content.iter_any()`` and
    fed to an ``etree.HTMLPullParser``; after each chunk the parser's pending
    events are drained and elements whose tag is ``'a'`` are yielded.

    Args:
        response: The HTTP response whose body is parsed as HTML.

    Yields:
        Parsed elements with ``tag == 'a'``, in the order the parser reports them.
    """
    pull_parser = etree.HTMLPullParser()
    try:
        async for chunk in response.content.iter_any():
            pull_parser.feed(chunk)
            # Drain the events produced by this chunk before reading more data.
            for _event, node in pull_parser.read_events():
                if node.tag != 'a':
                    continue
                yield node
    finally:
        # Always release the parser, even if the consumer stops iterating early.
        pull_parser.close()
async def get_candidate_urls(response: aiohttp.ClientResponse) -> AsyncIterator[URL]:
    """Yield the target URL of every anchor in *response* that has an ``href``.

    Absolute hrefs are yielded unchanged. Relative hrefs are resolved against
    ``response.url`` so that path-relative links (``../../packages/x.whl``)
    and any ``?query`` / ``#fragment`` part of the href are kept intact.

    Args:
        response: The HTTP response holding the HTML page to scan.

    Yields:
        One absolute ``yarl.URL`` per anchor with a non-empty ``href``.
    """
    async for element in get_anchor_tags(response):
        href = element.attrib.get('href')
        if not href:
            # Anchors without a target cannot point at a distribution file.
            continue
        url = URL(href)
        if not url.is_absolute():
            # ``with_path`` would replace the page's whole path with the raw
            # href (and swallow its fragment/query into the path); ``join``
            # performs proper relative-reference resolution instead.
            url = response.url.join(url)
        yield url
async def filter_by_requirement(requirement: Requirement, response: aiohttp.ClientResponse) -> AsyncIterator[URL]:
    """Yield the candidate URLs in *response* whose version satisfies *requirement*.

    The final path segment of each candidate URL is parsed as a wheel
    (``.whl``) or an sdist (``.tar.gz`` / ``.zip``) filename to obtain its
    version. Files with any other extension, and files whose name cannot be
    parsed, are skipped.

    Args:
        requirement: The requirement whose ``specifier`` the version must be in.
        response: The HTTP response holding the page of candidate links.

    Yields:
        URLs whose parsed version is contained in ``requirement.specifier``.
    """
    async for url in get_candidate_urls(response):
        filename = url.name
        try:
            if filename.endswith('.whl'):
                _name, version, _build, _tags = parse_wheel_filename(filename)
            elif filename.endswith(('.tar.gz', '.zip')):
                _name, version = parse_sdist_filename(filename)
            else:
                # TODO log a warning
                continue
        except ValueError:
            # packaging's invalid-filename / invalid-version errors derive
            # from ValueError; one malformed file name on the page should be
            # skipped like any other unrecognised file, not abort the scan.
            continue
        if version in requirement.specifier:
            yield url
async def get_from_index(index: URL, requirements: Iterator[Requirement]) -> Mapping[Requirement, Container[URL]]:
    """Collect, per requirement, the matching distribution URLs listed by *index*.

    One ``aiohttp.ClientSession`` is opened for the whole run. For each
    requirement, the page at ``simple/<requirement name>`` on *index* is
    fetched and filtered with ``filter_by_requirement``. Requirements are
    processed one after another.

    Args:
        index: Base URL of the package index.
        requirements: The requirements to look up.

    Returns:
        A mapping from each processed requirement to the set of URLs that
        satisfied it (an empty set when nothing matched).
    """
    matches: Dict[Requirement, Set[URL]] = defaultdict(set)
    async with aiohttp.ClientSession() as session:
        for requirement in requirements:
            page_url = index.with_path(f"simple/{requirement.name}")
            async with session.get(page_url) as response:
                # Touch the entry first so a requirement with no hits still appears.
                bucket = matches[requirement]
                async for url in filter_by_requirement(requirement, response):
                    bucket.add(url)
    return matches
async def main(index: URL, requirements: Iterator[Requirement]) -> None:
    """Look up *requirements* on *index* and print how many candidates each has.

    Requirements that carry an environment marker are only looked up when the
    marker evaluates to true; requirements without a marker are always used.

    Args:
        index: Base URL of the package index.
        requirements: The requirements to resolve.
    """
    applicable = (
        req for req in requirements
        if req.marker is None or req.marker.evaluate()
    )
    matches = await get_from_index(index, applicable)
    for req, candidate_urls in matches.items():
        # Print an ordered (requirement, count) pair. The previous set literal
        # ``{req, count}`` had no guaranteed element order.
        pprint((req, len(candidate_urls)))
# Requirement strings resolved when this file is run as a script.
reqs = [
    'setuptools',
    'flake8',
    'pylint>=2.9 ; python_version >= "3.8"',
    'requests',
    'mypy',
]

if __name__ == "__main__":
    # asyncio.run creates a fresh event loop, runs the coroutine to completion
    # and closes the loop afterwards; the get_event_loop()/run_until_complete
    # pairing is deprecated for this use and left the loop open.
    asyncio.run(main(URL("https://pypi.python.org/"), (Requirement(x) for x in reqs)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment