Skip to content

Instantly share code, notes, and snippets.

@emlyn
Last active April 16, 2021 22:14
Show Gist options
  • Select an option

  • Save emlyn/b6ac87ad1dcdc1cfbe427c5a4465e63d to your computer and use it in GitHub Desktop.

Select an option

Save emlyn/b6ac87ad1dcdc1cfbe427c5a4465e63d to your computer and use it in GitHub Desktop.
Bad breaking names
# Requires: pip install unidecode
import gzip
import re
import unidecode
# Build the regexes used to filter names.
#
# Input: elementlist.csv, from https://sciencenotes.org/list-elements-atomic-number/,
# one element per row with the columns:
# - Atomic number
# - Symbol
# - Name
# Only the Symbol column (index 1) is used.
with open('elementlist.csv', 'rt', encoding='utf-8') as f:
    elems = []
    for line in f:
        fields = line.split(',')
        # Blank or truncated rows have no symbol column; skip them rather
        # than crash with IndexError.
        if len(fields) < 2:
            continue
        symbol = fields[1].strip()
        # An empty symbol would add an empty alternative to the pattern below,
        # and an empty alternative matches every string.
        if symbol:
            elems.append(symbol)
# Alternation of every symbol, lowercased because it is searched against
# lowercased names. Symbols are escaped so they are always matched literally.
any_element = re.compile('|'.join(re.escape(s.lower()) for s in elems))
# Matches any single lowercase ASCII letter.
alpha = re.compile('[a-z]')
# Collect every distinct credited name that contains at least one letter and
# no element symbol anywhere in its normalised form.
#
# Input: name.basics.tsv.gz, from https://datasets.imdbws.com/, tab-separated
# with the columns:
# - nconst (string) - alphanumeric unique identifier of the name/person
# - primaryName (string) - name by which the person is most often credited
# - birthYear - in YYYY format
# - deathYear - in YYYY format if applicable, else '\N'
# - primaryProfession (array of strings) - the top-3 professions of the person
# - knownForTitles (array of tconsts) - titles the person is known for
#
# Result: `people`, a list of (normalised name, original name, nconst,
# primaryProfession) records, one per distinct original name.
leading_junk = re.compile(r'^[^a-z0-9]*')
separator_run = re.compile(r'[^a-z0-9]+')
# IMDb documents these files as UTF-8; say so explicitly so decoding does not
# depend on the machine's locale.
with gzip.open('name.basics.tsv.gz', 'rt', encoding='utf-8') as f:
    seen = set()
    people = []
    for line in f:
        # Drop the line terminator so the last column is clean, then unpack
        # the six columns (only three of them are needed).
        nconst, nm, _, _, pr, _ = line.rstrip('\n').split('\t')
        if nconst == 'nconst':
            # Column-header row, not a person.
            continue
        if nm in seen:
            # Already recorded under this exact name.
            continue
        # Normalise: transliterate to ASCII, lowercase, drop leading
        # punctuation, and collapse every other non-alphanumeric run to a space.
        plain_nm = unidecode.unidecode(nm).lower()
        plain_nm = leading_junk.sub('', plain_nm)
        plain_nm = separator_run.sub(' ', plain_nm)
        if alpha.search(plain_nm) and not any_element.search(plain_nm):
            people.append((plain_nm, nm, nconst, pr))
            seen.add(nm)
# Print the result as a Markdown bullet list, ordered by normalised name
# (the first field of each record in `people`).
#
# Characters that Markdown treats as markup are backslash-escaped. The pattern
# is a raw string so that the backslash itself is part of the character class;
# written as an ordinary string, '\\`' reaches the regex engine as an escaped
# backtick and backslashes in names are left unescaped.
md_special = re.compile(r'[\\`*_{}\[\]()#+\-.!]')
print("Found {} different names".format(len(people)))
print()
for _, nm, nconst, pr in sorted(people):
    extra = ""
    # '\N' is the null marker the dataset description gives for missing
    # values; treat it like an empty profession list instead of printing it.
    if pr and pr != '\\N':
        extra = " ({})".format(pr.replace(',', ', '))
    escaped = md_special.sub(lambda m: '\\' + m.group(0), nm)
    print('- [{}](https://www.imdb.com/{}/){}'.format(escaped, nconst, extra))

Found 1302 different names

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment