Last active
December 4, 2018 06:15
-
-
Save kmwenja/468d5a3c57b42d8ea66081128a4a57ee to your computer and use it in GitHub Desktop.
N-Gram Similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
An experiment in n-gram similarity: it scores two strings by the fraction of one string's n-grams that also occur in the other.
Usage: | |
`$ python trigram.py '<left string>' '<right string>' <maximum number of letters in a gram>`
Example: | |
``` | |
$ python trigram.py 'hello' 'hallo' 3 | |
Left to Right Similarity: 0.5 | |
Right to Left Similarity: 0.5 | |
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
def grammify(s, maxn=3):
    """Return every n-gram of ``s`` for gram sizes 1 through ``maxn``.

    Spaces are stripped from ``s`` before grams are taken. Grams are listed
    by increasing size and, within a size, left to right; repeated grams
    are kept.

    Args:
        s: The input string.
        maxn: Largest gram size to produce.

    Returns:
        A list of substrings. It is empty when ``s`` has no non-space
        characters or when ``maxn`` is less than 1.
    """
    s = s.replace(' ', '')
    # Cap the gram size at the string length *inclusively*, so a string that
    # is exactly n characters long still contributes its single n-gram.
    largest = min(maxn, len(s))
    grams = []
    for n in range(1, largest + 1):
        # There are exactly len(s) - n + 1 full-width windows of size n.
        grams.extend(s[i:i + n] for i in range(len(s) - n + 1))
    return grams
def main():
    """Print the n-gram similarity of two command-line strings.

    Reads the left string, the right string and the maximum gram size from
    ``sys.argv[1:4]``, then prints two scores: the share of the right
    string's grams that also occur in the left string, and the share of the
    left string's grams that also occur in the right string.

    Exits with a usage message when fewer than three arguments are given.
    Raises ``ValueError`` if the gram size is not an integer.
    """
    if len(sys.argv) < 4:
        sys.exit(
            "Usage: python {} '<left string>' '<right string>' "
            "<max gram size>".format(sys.argv[0])
        )

    s1 = sys.argv[1]
    s2 = sys.argv[2]
    maxn = int(sys.argv[3])

    left = grammify(s1, maxn=maxn)
    right = grammify(s2, maxn=maxn)
    common = set(left).intersection(right)

    # NOTE: ``common`` is a set while the denominators count repeated grams,
    # so a string containing a repeated gram scores below 1.0 against itself.

    # how similar is s1 to s2
    # An empty gram list would otherwise divide by zero; nothing can be
    # shared in that case, so report 0.0.
    right_score = len(common) / len(right) if right else 0.0
    print("Left to Right Similarity:", right_score)

    # how similar is s2 to s1
    left_score = len(common) / len(left) if left else 0.0
    print("Right to Left Similarity:", left_score)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment