Skip to content

Instantly share code, notes, and snippets.

@pietz
Last active November 19, 2022 17:20
Show Gist options
  • Save pietz/d6197f64c34d273a6d456d7b736c028d to your computer and use it in GitHub Desktop.
Save pietz/d6197f64c34d273a6d456d7b736c028d to your computer and use it in GitHub Desktop.
import re
from unidecode import unidecode
def fingerprint(string: str) -> str:
    """Return a word-order-insensitive clustering key for *string*.

    The input is transliterated to ASCII, lowercased, stripped of every
    character that is not an ASCII letter, digit or space, and split into
    whitespace-separated tokens. The unique tokens are sorted and joined
    with single spaces, so "Tom Cruise" and "Cruise, Tom" both produce
    "cruise tom".

    Args:
        string: The text to compute a key for.

    Returns:
        The sorted, de-duplicated tokens joined by single spaces. The result
        is the empty string when no letters or digits survive the cleanup.
    """
    # Transliterate BEFORE filtering: the ASCII-only character class below
    # would otherwise delete accented letters (e.g. the "é" in "Café")
    # before unidecode ever had a chance to map them to ASCII.
    string = unidecode(string)
    # Lowercase after transliteration so that any uppercase letters in the
    # transliterated text are folded as well.
    string = string.lower()
    # Remove all punctuation and control characters (spaces are kept so the
    # string can still be tokenized).
    string = re.sub(r"[^A-Za-z0-9 ]+", "", string)
    # Tokenize, drop duplicates and sort so word order does not matter.
    words = sorted(set(string.split()))
    return " ".join(words)
def ngram_fingerprint(string: str, n: int = 1) -> str:
    """Return a character n-gram clustering key for *string*.

    The input is trimmed, transliterated to ASCII, lowercased and stripped
    of every character that is not an ASCII letter or digit (whitespace
    included). The unique n-grams of the remaining text are sorted and
    concatenated; for example "Tom Cruise" with ``n=1`` gives "ceimorstu".

    Args:
        string: The text to compute a key for.
        n: Length of each n-gram. Defaults to 1.

    Returns:
        The sorted, de-duplicated n-grams concatenated without a separator.
        When the cleaned text is shorter than ``n`` there are no n-grams and
        the result is the empty string.
    """
    # Remove leading and trailing whitespace.
    string = string.strip()
    # Transliterate BEFORE filtering: running unidecode last (after the
    # ASCII-only filter) was a no-op, because every accented letter had
    # already been deleted by then.
    string = unidecode(string)
    # Lowercase after transliteration so that any uppercase letters in the
    # transliterated text are folded as well.
    string = string.lower()
    # Remove all punctuation, whitespace, and control characters.
    string = re.sub(r"[^A-Za-z0-9]+", "", string)
    # Collect every n-character window; the set removes duplicates.
    ngrams = {string[i : i + n] for i in range(len(string) - n + 1)}
    # Sort the n-grams and join them back together.
    return "".join(sorted(ngrams))
def mod_ngram_fingerprint(string: str, n: int = 1) -> str:
    """Return an n-gram key computed over the sorted, de-duplicated words.

    The input is first reduced the same way a word-level fingerprint is:
    transliterated to ASCII, lowercased, stripped of everything except ASCII
    letters, digits and spaces, then split into tokens that are
    de-duplicated and sorted. Those tokens are concatenated without a
    separator, and the unique character n-grams of that string are sorted
    and concatenated to form the key.

    Args:
        string: The text to compute a key for.
        n: Length of each n-gram. Defaults to 1.

    Returns:
        The sorted, de-duplicated n-grams concatenated without a separator.
        When the concatenated tokens are shorter than ``n`` there are no
        n-grams and the result is the empty string.
    """
    # Transliterate BEFORE filtering: the ASCII-only character class below
    # would otherwise delete accented letters before unidecode could map
    # them to ASCII.
    string = unidecode(string)
    # Lowercase after transliteration so that any uppercase letters in the
    # transliterated text are folded as well.
    string = string.lower()
    # Remove all punctuation and control characters (spaces are kept so the
    # string can still be tokenized).
    string = re.sub(r"[^A-Za-z0-9 ]+", "", string)
    # Tokenize, drop duplicate words, sort, and glue the words together so
    # the n-grams do not depend on the original word order.
    string = "".join(sorted(set(string.split())))
    # Collect every n-character window; the set removes duplicates.
    ngrams = {string[i : i + n] for i in range(len(string) - n + 1)}
    # Sort the n-grams and join them back together.
    return "".join(sorted(ngrams))
if __name__ == "__main__":
    # Demo: print every key variant for a few sample names, with a "---"
    # line between consecutive samples.
    samples = ("Tom Cruise", "Cruise, Tom", "Paris")
    for position, sample in enumerate(samples):
        if position:
            print("---")
        print(fingerprint(sample))
        # Same order as before: n-gram keys (n=1, n=2), then the modified
        # n-gram keys (n=1, n=2).
        for keyer in (ngram_fingerprint, mod_ngram_fingerprint):
            for size in (1, 2):
                print(keyer(sample, size))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment