Skip to content

Instantly share code, notes, and snippets.

@oxydron
Last active September 10, 2019 01:01
Show Gist options
  • Save oxydron/0df799bfd9c188b2a64505d56020c6d6 to your computer and use it in GitHub Desktop.
Save oxydron/0df799bfd9c188b2a64505d56020c6d6 to your computer and use it in GitHub Desktop.
def tokenizer(p, n=2):
    """Split a string into occurrence-numbered character n-grams.

    The input is wrapped with a ``$`` start marker and a ``#`` end marker,
    then cut into overlapping windows of ``n`` characters. Each window is
    suffixed with its running occurrence count, so a repeated n-gram
    produces distinct tokens (``ar1``, ``ar2``, ...).

    Args:
        p: Text to tokenize.
        n: Window size in characters; must be at least 1.

    Returns:
        A list of tokens in order of appearance. The list is empty when
        the padded string is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")
    padded = '$' + p + '#'
    seen = {}
    tokens = []
    # Stop at the last *full* window. A bound of ``len - 1`` is only right
    # for n == 2: it yields truncated tail tokens for n > 2 and skips the
    # final character for n == 1.
    for start in range(len(padded) - n + 1):
        gram = padded[start:start + n]
        seen[gram] = seen.get(gram, 0) + 1
        tokens.append(gram + str(seen[gram]))
    return tokens
# Demo: print the token list produced for two sample strings.
for word in ('admin_pedrinho', 'arara'):
    print(tokenizer(word))
def jaccard(a,b):
    """Return the Jaccard index of the token sets of two strings.

    Both strings are tokenized with ``tokenizer`` (default window size).
    The score is the number of tokens the two sets share divided by the
    number of tokens in their union.

    Reference: https://www.wikiwand.com/en/Jaccard_index
    """
    tokens_a = set(tokenizer(a))
    tokens_b = set(tokenizer(b))
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    # float() keeps this a true division regardless of interpreter version.
    return len(shared) / float(len(combined))
# Demo: score 'api' against several variants of the same word.
for candidate in (
    'apicultor',
    '_api',
    '_api_',
    'apiu',
    'api_',
    'a_p_i',
    'a_p_i____pedringo',
):
    print(jaccard('api', candidate))
import re
# Uncle Bênis's gnarly regex: accepts "api" (letters optionally separated by
# '_' or '-') when it is preceded by a separator at the start of the string,
# followed by a separator, or is the whole string.
p = re.compile(r'^[_-]a[_-]?p[_-]?i|a[_-]?p[_-]?i[_-]|^a[_-]?p[_-]?i$')
# match() only tries the pattern at position 0 of each sample.
for sample in ('apicultor', '_api', '_api_', 'api_', 'a_p_i', 'a_p_i____pedringo'):
    print(p.match(sample))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment