Last active
December 2, 2019 14:08
-
-
Save kokes/1685a9e6792d91594880dc7a9f3bceb2 to your computer and use it in GitHub Desktop.
Find potential typosquatters on PyPI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import itertools | |
from urllib.request import urlopen | |
from urllib.error import HTTPError | |
import lxml.html | |
import lxml.etree | |
from jellyfish import levenshtein_distance | |
def save_all_packages():
    """Download the PyPI "simple" index and store every listed name on disk.

    The page at https://pypi.org/simple/ is parsed as HTML and the text of
    each ``<a>`` element is written to ``all_packages.txt`` (UTF-8), one
    name per line. Any existing file of that name is overwritten.
    """
    index_url = 'https://pypi.org/simple/'
    with urlopen(index_url) as response, \
            open('all_packages.txt', 'w', encoding='utf8') as out:
        root = lxml.html.parse(response).getroot()
        # Every anchor in the index page is treated as one package entry.
        for anchor in root.iterfind('.//a'):
            out.write(anchor.text + '\n')
def read_all_packages():
    """Yield the package names stored in ``all_packages.txt``.

    The file is expected to hold one package name per line. Surrounding
    whitespace is stripped from every line, and lines that are empty after
    stripping are skipped so that no empty "package name" is produced.

    Yields:
        str: one package name per non-blank line, in file order.

    Raises:
        OSError: if ``all_packages.txt`` cannot be opened.
    """
    # Decode explicitly as UTF-8: save_all_packages() writes the file with
    # that encoding, while the platform default may be something else.
    with open('all_packages.txt', encoding='utf8') as f:
        for line in f:
            name = line.strip()
            if name:
                yield name
def new_packages():
    """Yield the names of packages listed in PyPI's new-packages RSS feed.

    The feed at https://pypi.org/rss/packages.xml is downloaded and parsed;
    for every ``channel/item/guid`` element the package name is taken to be
    the last path segment of the guid URL, which must end with ``/``.

    Yields:
        str: the package name extracted from each feed item.

    Raises:
        ValueError: if a guid is empty or does not end with ``/``.
    """
    feed_url = 'https://pypi.org/rss/packages.xml'
    # The document is parsed in full inside the ``with`` block, so the HTTP
    # response can be closed before any name is yielded.
    with urlopen(feed_url) as r:
        et = lxml.etree.parse(r).getroot()
    for guid in et.iterfind('channel/item/guid'):
        guidtext = guid.text or ''
        # Explicit check rather than ``assert``: asserts are stripped under
        # ``python -O``, which would silently chop a character off the name.
        if not guidtext.endswith('/'):
            raise ValueError(f'unexpected guid format: {guidtext!r}')
        yield guidtext[:-1].rpartition('/')[-1]
def similar_packages(aset, bset, distance_threshold=2):
    """Yield pairs of distinct names that are within a small edit distance.

    Every combination of one name from ``aset`` and one from ``bset`` is
    examined. A pair ``(a, b)`` is yielded when the names differ and their
    Levenshtein distance is at most ``distance_threshold``.

    Args:
        aset: iterable of names (first element of each yielded pair).
        bset: iterable of names (second element of each yielded pair).
        distance_threshold: largest edit distance still reported.

    Yields:
        tuple: ``(a, b)`` for each sufficiently similar, non-identical pair.
    """
    for left, right in itertools.product(aset, bset):
        # Cheap rejections first: identical names are not reported, and a
        # length gap above the threshold already rules out a close match.
        if left == right or abs(len(left) - len(right)) > distance_threshold:
            continue
        if levenshtein_distance(left, right) <= distance_threshold:
            yield left, right
def latest_stats(pkg):
    """Return the last-day download count pypistats.org reports for ``pkg``.

    Queries ``https://pypistats.org/api/packages/<pkg>/recent`` and returns
    the ``data.last_day`` field of the JSON response.

    Args:
        pkg: package name to look up.

    Returns:
        The ``last_day`` value from the response, or ``None`` when the
        service answers with HTTP 404.

    Raises:
        HTTPError: for any HTTP error status other than 404.
    """
    stats_url = f'https://pypistats.org/api/packages/{pkg}/recent'
    try:
        with urlopen(stats_url) as response:
            payload = json.load(response)
    except HTTPError as err:
        # Only 404 is an expected outcome; everything else is propagated.
        if err.code != 404:
            raise
        return None

    recent = payload['data']
    return recent['last_day']
if __name__ == '__main__':
    # An existing package is only reported when its last-day download
    # count exceeds this value.
    popularity_threshold = 10000

    # save_all_packages() # TODO
    # Compare every known package name against the newly published ones,
    # allowing an edit distance of at most 1.
    candidates = similar_packages(read_all_packages(), new_packages(), 1)

    errs = []
    for existing, newcoming in candidates:
        stats = latest_stats(existing)
        # Skip pairs with no stats or with too few downloads.
        if not (stats and stats > popularity_threshold):
            continue
        errs.append(f'Package {newcoming} is similar in name to {existing} ({stats} downloads in the last day)')

    if errs:
        print(errs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment