Skip to content

Instantly share code, notes, and snippets.

@kzinmr
Created September 28, 2018 05:16
Show Gist options
  • Save kzinmr/d308aeb96391f4bb40d0fc84cb536a7d to your computer and use it in GitHub Desktop.
Save kzinmr/d308aeb96391f4bb40d0fc84cb536a7d to your computer and use it in GitHub Desktop.
# char-ngram jaccard sim
from nltk import ngrams
from nltk.metrics import jaccard_distance
def ngram(sequence_list, n=3):
    """Return the grams of size 1 through ``n`` of a sequence, joined into strings.

    Args:
        sequence_list: A string (for character grams) or a sequence of
            strings (for token grams).
        n: Largest gram size to include.

    Returns:
        A list of strings, ordered by gram size (smallest first). The list
        is empty when ``n`` is less than 1.
    """
    # The upper bound is inclusive. The previous ``range(1, n)`` stopped at
    # ``n - 1``, so ``n=1`` produced no grams at all and size-``n`` grams
    # were never generated.
    return [
        ''.join(grams)
        for size in range(1, n + 1)
        for grams in ngrams(sequence_list, size)
    ]
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the distinct items in ``s1`` and ``s2``.

    The value is computed as ``1 - jaccard_distance`` over the two item sets.

    Args:
        s1: Iterable of hashable items.
        s2: Iterable of hashable items.

    Returns:
        ``0.`` when either input has no items, otherwise one minus the
        Jaccard distance of the two sets.
    """
    # Build each set exactly once: this avoids redundant work and keeps
    # one-shot iterables (e.g. generators) from being exhausted by the
    # emptiness check before the distance is computed.
    set1, set2 = set(s1), set(s2)
    if not set1 or not set2:
        return 0.
    return 1. - jaccard_distance(set1, set2)
def char_overlap(s1, s2, n=3):
    """Score two strings by the Jaccard similarity of their character grams.

    Space characters are removed from both strings before the grams are
    taken; ``n`` is passed through to ``ngram`` unchanged.
    """
    # Dropping every ' ' is the same as splitting on ' ' and re-joining.
    grams_a = ngram(s1.replace(' ', ''), n=n)
    grams_b = ngram(s2.replace(' ', ''), n=n)
    return jaccard_similarity(grams_a, grams_b)
def word_overlap(s1, s2):
    """Score two strings by the Jaccard similarity of their space-separated tokens.

    Each string is split on single spaces and the token lists are passed
    through ``ngram`` with ``n=1`` before being compared.
    """
    # NOTE(review): the score is only meaningful if ``ngram(..., n=1)``
    # returns the individual tokens -- confirm against ``ngram``.
    tokens_a, tokens_b = s1.split(' '), s2.split(' ')
    return jaccard_similarity(ngram(tokens_a, n=1), ngram(tokens_b, n=1))
def sort_by_overlap(q, slist):
    """Rank candidate strings by their character-gram overlap with ``q``.

    Args:
        q: Query string.
        slist: Iterable of candidate strings.

    Returns:
        A list of ``(candidate, score)`` tuples sorted by descending score.
        Candidates with equal scores keep their input order.
    """
    # Score and pair in a single pass. Iterating ``slist`` once for the
    # scores and again for the zip left nothing to pair when ``slist`` was
    # a one-shot iterable such as a generator.
    scored = [(s, char_overlap(q, s)) for s in slist]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
@kzinmr
Copy link
Author

kzinmr commented Jan 6, 2023

def jaccard_similarity(s, t):
    """Return the Jaccard similarity of the distinct items in ``s`` and ``t``.

    The result is ``|S & T| / |S | T|`` over the two item sets, or ``0``
    when either input contains no items.
    """
    left = set(s)
    right = set(t)
    # Guard first: an empty side means there is nothing to compare.
    if not left or not right:
        return 0
    shared = left & right
    combined = left | right
    return len(shared) / len(combined)


def ngrams(seq, n):
    """Return the contiguous length-``n`` slices of ``seq`` in order.

    Args:
        seq: A sliceable sequence with a length (e.g. ``str`` or ``list``).
        n: Window size.

    Returns:
        A list of slices of ``seq``. The list is empty when ``n`` is less
        than 1 or larger than ``len(seq)``.
    """
    if n < 1:
        # A non-positive window only produces empty or meaningless slices,
        # so there are no grams to report.
        return []
    return [seq[i : i + n] for i in range(len(seq) - n + 1)]


def ngram(seq, n=3):
    """Return the grams of size 1 through ``n`` of ``seq``, joined into strings.

    Args:
        seq: A string (for character grams) or a list of strings (for
            token grams).
        n: Largest gram size to include.

    Returns:
        A list of strings ordered by gram size (smallest first).
    """
    # Sizes run from 1 to n inclusive. The previous ``range(n)`` started at
    # 0, which injected empty-string grams (making any two inputs look
    # similar) and never produced the size-``n`` grams.
    return ["".join(ng) for size in range(1, n + 1) for ng in ngrams(seq, size)]


def char_overlap(s, t, n=3):
    """Score two strings by the Jaccard similarity of their character grams.

    Space characters are removed from both strings first; ``n`` is handed
    to ``ngram`` unchanged.
    """
    # Removing every " " is equivalent to splitting on " " and re-joining.
    compact_s = s.replace(" ", "")
    compact_t = t.replace(" ", "")
    return jaccard_similarity(ngram(compact_s, n), ngram(compact_t, n))


def arg_sort_by_overlap(q, slist):
    """Return the positions of ``slist`` ordered by overlap with ``q``, best first.

    Each candidate is scored with ``char_overlap(q, candidate)``. Positions
    whose candidates score equally keep their original relative order.
    """
    scores = [char_overlap(q, s) for s in slist]
    # The sort is stable, so ties stay in ascending index order.
    return sorted(range(len(scores)), key=lambda idx: -scores[idx])


def word_overlap(s, t):
    """Score two strings by the Jaccard similarity of their space-separated tokens.

    Each string is split on single spaces and the token lists are passed
    through ``ngram`` with ``n=1`` before being compared.
    """
    # NOTE(review): the score is only meaningful if ``ngram(..., n=1)``
    # returns the individual tokens -- confirm against ``ngram``.
    tokens_s, tokens_t = s.split(" "), t.split(" ")
    return jaccard_similarity(ngram(tokens_s, n=1), ngram(tokens_t, n=1))


def sort_by_overlap(q, slist):
    """Pair each candidate with its overlap score against ``q``, best first.

    Returns a list of ``(candidate, score)`` tuples in descending score
    order; candidates with equal scores keep their input order.
    """
    ranked = [(candidate, char_overlap(q, candidate)) for candidate in slist]
    # A stable descending sort is equivalent to an ascending sort on the
    # negated score.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment