Skip to content

Instantly share code, notes, and snippets.

@pelson
Forked from dstufft/pep691.py
Created June 13, 2022 11:47
Show Gist options
  • Save pelson/d44ab5a9ca2dd77d37c4859a5f750a29 to your computer and use it in GitHub Desktop.
Save pelson/d44ab5a9ca2dd77d37c4859a5f750a29 to your computer and use it in GitHub Desktop.
import requests
import zlib
import struct
import json
import html5lib
import re
import pprint
# Fixed 10-byte gzip member header (RFC 1952): magic 1f 8b, CM=8 (deflate),
# no flags, zero mtime, XFL=2, OS=255 (unknown).
_gzip_header = b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x02\xff"


def gzip_app_iter(app_iter):  # Taken from WebOb, which Warehouse uses
    """Lazily gzip-encode an iterable of byte chunks.

    Args:
        app_iter: Iterable of ``bytes`` objects making up the uncompressed
            body.

    Yields:
        ``bytes`` pieces that, concatenated, form one complete gzip member:
        the fixed header, the deflate-compressed data, and an 8-byte trailer.
        Empty pieces are never yielded.
    """
    size = 0
    crc = zlib.crc32(b"") & 0xFFFFFFFF
    # Negative wbits asks zlib for a raw deflate stream (no zlib header or
    # checksum); the gzip framing is emitted by hand around it.
    compress = zlib.compressobj(
        9, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0
    )

    yield _gzip_header
    for item in app_iter:
        size += len(item)
        crc = zlib.crc32(item, crc) & 0xFFFFFFFF

        # The compress function may return zero length bytes if the input is
        # small enough; it buffers the input for the next iteration or for a
        # flush.
        result = compress.compress(item)
        if result:
            yield result

    # Similarly, flush may also not yield a value. It must run exactly once,
    # after every chunk has been fed in, because it finishes the stream.
    result = compress.flush()
    if result:
        yield result

    # Trailer: CRC-32 of the uncompressed data followed by its length modulo
    # 2**32, both little-endian unsigned 32-bit integers.
    yield struct.pack("<2L", crc, size & 0xFFFFFFFF)
def normalize(name):  # Taken from PEP 503
    """Return the normalized form of a project name.

    Every run of ``-``, ``_`` and ``.`` characters is collapsed into a single
    ``-`` and the result is lower-cased.

    Args:
        name: Project name as a ``str``.

    Returns:
        The normalized name.
    """
    return re.sub(r"[-_.]+", "-", name).lower()
# Collected byte counts, keyed "<format>.<compressed|uncompressed>".
data = {}

# Seconds to wait for a connection / for the next bytes from the server.
# Without a timeout, requests can block forever on a stalled connection.
_REQUEST_TIMEOUT = 60

# Pass 1: size of the current index exactly as received on the wire.
# decode_content=False keeps urllib3 from undoing any Content-Encoding, so the
# count reflects whatever compression the server applied (if any).
# The context manager releases the streamed connection when done.
with requests.get(
    "https://pypi.org/simple", stream=True, timeout=_REQUEST_TIMEOUT
) as resp:
    resp.raise_for_status()
    data["current.compressed"] = sum(
        len(chunk) for chunk in resp.raw.stream(1024, decode_content=False)
    )

# Pass 2: fetch again with normal decoding to get the decoded body.
resp = requests.get("https://pypi.org/simple", timeout=_REQUEST_TIMEOUT)
resp.raise_for_status()
data["current.uncompressed"] = len(resp.content)

# Two candidate JSON payloads built from the same links:
#   jdata  maps normalized name -> {"url": ...}
#   jdata2 maps normalized name -> {"name": <link text>, "url": ...}
jdata = {"meta": {"api-version": "1.0"}, "projects": {}}
jdata2 = {"meta": {"api-version": "1.0"}, "projects": {}}

html = html5lib.parse(resp.content, namespaceHTMLElements=False)
for link in html.findall(".//a"):
    key = normalize(link.text)
    href = link.attrib["href"]
    jdata["projects"][key] = {"url": href}
    jdata2["projects"][key] = {
        "name": link.text,
        "url": href,
    }

# Compact separators and sorted keys give a minimal, deterministic encoding.
jcontent = json.dumps(jdata, sort_keys=True, separators=(",", ":")).encode("utf8")
jcontent2 = json.dumps(jdata2, sort_keys=True, separators=(",", ":")).encode("utf8")

data["pep691.uncompressed"] = len(jcontent)
data["pep691.name.uncompressed"] = len(jcontent2)
# Sum the lengths of the gzip pieces directly; no need to build a list first.
data["pep691.compressed"] = sum(map(len, gzip_app_iter([jcontent])))
data["pep691.name.compressed"] = sum(map(len, gzip_app_iter([jcontent2])))

pprint.pprint(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment