available PyPI synonyms
# check for available PyPI synonyms of a given word
# $ available_pypi_synonyms.py <words...>
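# example invocation (illustrative only, the word and flag values below are arbitrary):
# $ available_pypi_synonyms.py dataset --pages 2 --available --status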
import argparse
import itertools
import re

import diskcache as dc
import requests
from bs4 import BeautifulSoup
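# note: the script assumes the third-party dependencies above are installed, e.g.
#   $ pip install diskcache requests beautifulsoup4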

# ========================================================================= #
# ANSI Colors                                                               #
# ========================================================================= #


RST = '\033[0m'

# bright colors (ANSI codes 90-97)
GRY  = '\033[90m'
lRED = '\033[91m'
lGRN = '\033[92m'
lYLW = '\033[93m'
lBLU = '\033[94m'
lMGT = '\033[95m'
lCYN = '\033[96m'
WHT  = '\033[97m'

# standard colors (ANSI codes 30-37)
BLK  = '\033[30m'
RED  = '\033[31m'
GRN  = '\033[32m'
YLW  = '\033[33m'
BLU  = '\033[34m'
MGT  = '\033[35m'
CYN  = '\033[36m'
lGRY = '\033[37m'
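# usage note (illustrative): wrap text in a color code and RST to reset afterwards,
# e.g. print(f'{lGRN}available{RST}') prints "available" in bright green on ANSI terminals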

# ========================================================================= #
# Time                                                                      #
# ========================================================================= #


SEC  = 1
MIN  = SEC * 60
HOUR = MIN * 60
DAY  = HOUR * 24
WEEK = DAY * 7
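# e.g. 1*WEEK == 7*24*60*60 == 604800 seconds, which is the unit that diskcache's
# `expire` argument expects below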

# ========================================================================= #
# UTIL                                                                      #
# ========================================================================= #


# cache data so we don't need to make multiple requests
_CACHE = dc.Cache('_cache_/synonyms')

# fake a request from a browser
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}


@_CACHE.memoize(expire=1*WEEK)
def fetch_page_content(url):
    page = requests.get(url, headers=_HEADERS)
    return page


@_CACHE.memoize(expire=1*DAY)
def get_status_code(url):
    return requests.head(url, headers=_HEADERS).status_code


def normalize(name):
    # https://www.python.org/dev/peps/pep-0503/#normalized-names
    return re.sub(r"[-_. ]+", "-", name).lower()
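# illustrative only: PEP 503 normalization collapses runs of "-", "_", "." (and spaces here)
# into a single "-" and lowercases, e.g. normalize("My_Package.Name") == "my-package-name"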

# ========================================================================= #
# Synonyms                                                                  #
# ========================================================================= #


def fetch_synonyms_page_words(word: str, page=1):
    # normalise the word
    normalised_word = normalize(word)
    assert normalised_word.isidentifier(), f'normalised word: {repr(word)} -> {repr(normalised_word)} is not a valid identifier!'
    # fetch the content
    url = 'https://www.powerthesaurus.org/{word}/synonyms/{page}'.format(word=normalised_word, page=page)
    response = fetch_page_content(url)
    assert response.status_code == 200, f'synonyms response returned the wrong status code: {response.status_code}, should be 200'
    # parse the page & extract the synonyms
    soup = BeautifulSoup(response.content, 'html.parser')
    synonym_blocks = soup.find_all('div', id="primary-area", recursive=True)
    synonyms = [block.find('a').text for block in synonym_blocks]
    # done
    return synonyms


def yield_synonyms(word: str, pages=1):
    unique = set()
    # for each page, only yield words that have not been seen before
    for page in range(pages):
        synonyms = fetch_synonyms_page_words(word=word, page=page+1)
        # stop early if the page contains nothing new
        if not (set(synonyms) - unique):
            break
        for synonym in synonyms:
            if synonym not in unique:
                yield synonym
                unique.add(synonym)
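# usage sketch (results depend on the live powerthesaurus.org pages, the values shown are hypothetical):
#   list(yield_synonyms('fast', pages=2))  # -> ['quick', 'rapid', ...]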

# ========================================================================= #
# PyPI                                                                      #
# ========================================================================= #


def yield_pypi_synonyms(word, pages=1):
    for synonym in itertools.chain([word], yield_synonyms(word, pages=pages)):
        normalized_synonym = normalize(synonym)
        status_code = get_status_code('https://pypi.org/project/{project}/'.format(project=normalized_synonym))
        yield (normalized_synonym, status_code == 404, status_code)
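# note: each yielded tuple is (normalized_name, available, status_code) -- the script treats a
# 404 from https://pypi.org/project/<name>/ as meaning the name is unclaimed and so available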


def print_pypi_synonyms(words, pages=1, available_only=False, show_status=False):
    for word in words:
        title = f'Synonyms for: {repr(word)}'
        # print the heading
        print('='*len(title))
        print(title)
        print('='*len(title))
        print()
        # print the words
        for i, (synonym, available, status_code) in enumerate(yield_pypi_synonyms(word, pages=pages)):
            status = f" [{status_code:3d}]" if show_status else ""
            if available:
                print(f'{i:3d}: ✅{status} {lGRN}{synonym}{RST}')
            elif not available_only:
                print(f'{i:3d}: ❌{status} {RED}{synonym}{RST}')
        # end
        print()


# ========================================================================= #
# Entrypoint                                                                #
# ========================================================================= #


if __name__ == '__main__':
    DEFAULT_WORDS = ['default']
    DEFAULT_PAGES = 1
    # parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('words', type=str, nargs='*', default=DEFAULT_WORDS)
    parser.add_argument('-p', '--pages', type=int, default=DEFAULT_PAGES)
    parser.add_argument('-a', '--available', action='store_true')
    parser.add_argument('-s', '--status', action='store_true')
    args = parser.parse_args()
    # print everything
    print_pypi_synonyms(words=args.words, pages=args.pages, available_only=args.available, show_status=args.status)


# ========================================================================= #
# END                                                                       #
# ========================================================================= #