Skip to content

Instantly share code, notes, and snippets.

@kokes
Last active December 2, 2019 14:08
Show Gist options
  • Save kokes/1685a9e6792d91594880dc7a9f3bceb2 to your computer and use it in GitHub Desktop.
Save kokes/1685a9e6792d91594880dc7a9f3bceb2 to your computer and use it in GitHub Desktop.
Find potential typosquatters on PyPI
import json
import re
import itertools
from urllib.request import urlopen
from urllib.error import HTTPError
import lxml.html
import lxml.etree
from jellyfish import levenshtein_distance
def save_all_packages():
    """Download the PyPI simple index and save one package name per line.

    The text of every ``<a>`` element found in the page served at
    ``https://pypi.org/simple/`` is written, followed by a newline, to
    ``all_packages.txt`` (UTF-8, relative to the current working directory).
    Any existing file of that name is overwritten.
    """
    index_url = 'https://pypi.org/simple/'
    with urlopen(index_url) as response, \
            open('all_packages.txt', 'w', encoding='utf8') as out:
        root = lxml.html.parse(response).getroot()
        # One output line per anchor in the index page.
        out.writelines(anchor.text + '\n' for anchor in root.iterfind('.//a'))
def read_all_packages():
    """Yield the package names stored in ``all_packages.txt``.

    The file is decoded as UTF-8, matching the encoding
    ``save_all_packages`` uses when it writes the file, so the result does
    not depend on the platform's default encoding.

    Yields:
        str: each line with surrounding whitespace stripped. Lines that are
        empty after stripping are skipped, since an empty string is not a
        usable package name.

    Raises:
        FileNotFoundError: if ``all_packages.txt`` does not exist in the
            current working directory.
    """
    with open('all_packages.txt', encoding='utf8') as f:
        for line in f:
            name = line.strip()
            if name:
                yield name
def new_packages():
    """Yield the names of packages listed in PyPI's new-packages RSS feed.

    The feed at ``https://pypi.org/rss/packages.xml`` is fetched and parsed,
    and for every ``channel/item/guid`` element the last path segment of its
    text (which must end in ``/``) is yielded as the package name.

    Yields:
        str: the package name extracted from each guid.

    Raises:
        ValueError: if a guid has no text or does not end with ``/``. This
            is an explicit check rather than an ``assert`` so it still runs
            under ``python -O``.
    """
    feed_url = 'https://pypi.org/rss/packages.xml'
    # Parse the complete document inside the ``with`` so the HTTP response is
    # closed before the generator starts yielding, however slowly (or
    # incompletely) the caller consumes it.
    with urlopen(feed_url) as r:
        root = lxml.etree.parse(r).getroot()
    for guid in root.iterfind('channel/item/guid'):
        guidtext = guid.text
        if not guidtext or not guidtext.endswith('/'):
            raise ValueError(f'unexpected guid in package feed: {guidtext!r}')
        # Drop the trailing slash, then keep what follows the last '/'.
        yield guidtext[:-1].rpartition('/')[-1]
def _normalize_name(name):
    """Return *name* in PEP 503 normalized form (lowercase, runs of -_. as '-')."""
    return re.sub(r'[-_.]+', '-', name).lower()


def similar_packages(aset, bset, distance_threshold=2):
    """Yield pairs of distinct package names that are within an edit distance.

    Args:
        aset: iterable of package names (consumed once).
        bset: iterable of package names (consumed once).
        distance_threshold: maximum Levenshtein distance, inclusive, for a
            pair to be reported.

    Yields:
        tuple[str, str]: ``(a, b)`` with ``a`` from *aset* and ``b`` from
        *bset*, in their original spelling, whenever the two names refer to
        different projects and ``levenshtein_distance(a, b)`` is at most
        *distance_threshold*.

    Two names are treated as the same project, and skipped, when they are
    equal after PEP 503 normalization, so that different spellings of one
    project (``Foo_Bar`` vs ``foo-bar``) are not reported as lookalikes.
    """
    # Normalize each name once up front instead of once per pair.
    a_names = [(a, _normalize_name(a)) for a in aset]
    b_names = [(b, _normalize_name(b)) for b in bset]
    for (a, a_norm), (b, b_norm) in itertools.product(a_names, b_names):
        if a_norm == b_norm:
            continue
        # Cheap pre-filter: the edit distance is at least the length
        # difference, so such pairs can never be within the threshold.
        if abs(len(a) - len(b)) > distance_threshold:
            continue
        if levenshtein_distance(a, b) <= distance_threshold:
            yield a, b
def latest_stats(pkg):
    """Return the ``last_day`` figure pypistats reports for *pkg*.

    Fetches ``https://pypistats.org/api/packages/<pkg>/recent``, decodes the
    JSON body and returns its ``['data']['last_day']`` entry.

    Returns:
        The ``last_day`` value from the response, or ``None`` when the API
        answers with HTTP 404.

    Raises:
        HTTPError: for any HTTP error status other than 404.
    """
    endpoint = f'https://pypistats.org/api/packages/{pkg}/recent'
    try:
        with urlopen(endpoint) as response:
            payload = json.load(response)
    except HTTPError as err:
        # Only a 404 is an expected outcome; anything else is a real failure.
        if err.code != 404:
            raise
        return None
    else:
        return payload['data']['last_day']
if __name__ == '__main__':
    # Report newly published packages whose name is within edit distance 1 of
    # an existing package that had more than `popularity_threshold` downloads
    # in the last day.
    popularity_threshold = 10000

    # read_all_packages() reads all_packages.txt, which save_all_packages()
    # writes; the refresh call is left disabled, as in the original script.
    # save_all_packages() # TODO
    candidates = similar_packages(read_all_packages(), new_packages(), 1)

    alerts = []
    for existing, newcoming in candidates:
        stats = latest_stats(existing)
        # Skip packages without stats (None / 0) and unpopular ones.
        if not stats or stats <= popularity_threshold:
            continue
        alerts.append(f'Package {newcoming} is similar in name to {existing} ({stats} downloads in the last day)')

    if alerts:
        print(alerts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment