Created
November 23, 2017 11:01
-
-
Save kawa-kokosowa/59da8e964bdc26f46ee3e3164c6a8b4d to your computer and use it in GitHub Desktop.
Jaccard Similarity of Bags (python3; builtin only)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Jaccard Similarity of Bags | |
Using the artificial restriction of only Python 3 builtins. | |
""" | |
import doctest | |
def multiset_intersection_cardinality(x: list, y: list) -> int:
    """Count the elements shared by the bags ``x`` and ``y``.

    Every occurrence in one bag is paired with at most one equal occurrence
    in the other, so a value present ``a`` times in ``x`` and ``b`` times in
    ``y`` contributes ``min(a, b)`` to the total. Neither input is modified.
    """
    if len(x) < len(y):
        shorter, longer = x, y
    else:
        shorter, longer = y, x

    # Consume matches from a copy so the caller's list stays untouched.
    unmatched = longer.copy()
    matched = 0
    for item in shorter:
        try:
            unmatched.remove(item)
        except ValueError:
            # No remaining equal occurrence in the longer bag.
            continue
        matched += 1
    return matched
def multiset_union_cardinality(x: list, y: list) -> int:
    """Return the size of the bag union of ``x`` and ``y``.

    The bag (multiset) union used here keeps every occurrence from both
    inputs, so its cardinality is the two lengths added together; values
    that appear in both bags are not deduplicated.
    """
    return sum(len(bag) for bag in (x, y))
def jaccard_similarity_bags(x: list, y: list) -> float:
    """Compute the Jaccard similarity of two bags (aka multisets).

    The score is the bag-intersection size divided by the bag-union size.
    Because the union counts every occurrence from both inputs, two
    identical non-empty bags score 0.5 rather than 1.0.

    Raises:
        ZeroDivisionError: If both ``x`` and ``y`` are empty.

    Example:
        >>> jaccard_similarity_bags([1,1,1,2], [1,1,2,2,3])
        0.3333333333333333
        >>> jaccard_similarity_bags([1,1,1,2], [1,2,3,4])
        0.25
        >>> jaccard_similarity_bags([1,1,2,2,3], [1,2,3,4])
        0.3333333333333333
    """
    shared = multiset_intersection_cardinality(x, y)
    total = multiset_union_cardinality(x, y)
    return shared / total
if __name__ == "__main__":
    # When run as a script, execute the doctest examples in this module's
    # docstrings as a self-check.
    doctest.testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The docstring for multiset_union_cardinality is wrong