Last active
April 4, 2024 04:33
-
-
Save gaulinmp/da5825de975ed0ea6a24186434c24fe4 to your computer and use it in GitHub Desktop.
N-Gram Similarity Comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:17.237596", | |
"start_time": "2017-03-21T18:07:16.630935" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Get Tuple algorithms \n", | |
"import re\n", | |
"import math\n", | |
"import numpy as np\n", | |
"from itertools import chain\n", | |
"from collections import Counter\n", | |
"import nltk\n", | |
"from nltk.util import ngrams # This is the ngram magic.\n", | |
"from textblob import TextBlob\n", | |
"\n", | |
"NGRAM = 4\n", | |
"\n", | |
"re_sent_ends_naive = re.compile(r'[.\\n]')\n", | |
"re_stripper_alpha = re.compile('[^a-zA-Z]+')\n", | |
"re_stripper_naive = re.compile('[^a-zA-Z\\.\\n]')\n", | |
"\n", | |
"splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))\n", | |
"\n", | |
"sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')\n", | |
"\n", | |
"def get_tuples_nosentences(txt):\n", | |
" \"\"\"Get tuples that ignores all punctuation (including sentences).\"\"\"\n", | |
" if not txt: return None\n", | |
" ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM)\n", | |
" return list(ng)\n", | |
"\n", | |
"def get_tuples_manual_sentences(txt):\n", | |
" \"\"\"Naive get tuples that uses periods or newlines to denote sentences.\"\"\"\n", | |
" if not txt: return None\n", | |
" sentences = (x.split() for x in splitter_naive(txt) if x)\n", | |
" ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM)\n", | |
" return list(chain(*ng))\n", | |
"\n", | |
"def get_tuples_nltk_punkt_sentences(txt):\n", | |
" \"\"\"Get tuples that doesn't use textblob.\"\"\"\n", | |
" if not txt: return None\n", | |
" sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x)\n", | |
" # Need to filter X because of empty 'words' from punctuation split\n", | |
" ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM)\n", | |
" return list(chain(*ng))\n", | |
"\n", | |
"def get_tuples_textblob_sentences(txt):\n", | |
" \"\"\"New get_tuples that does use textblob.\"\"\"\n", | |
" if not txt: return None\n", | |
" tb = TextBlob(txt)\n", | |
" ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) > NGRAM)\n", | |
" return [item for sublist in ng for item in sublist]\n", | |
"\n", | |
"def jaccard_distance(a, b):\n", | |
" \"\"\"Calculate the jaccard distance between sets A and B\"\"\"\n", | |
" a = set(a)\n", | |
" b = set(b)\n", | |
" return 1.0 * len(a&b)/len(a|b)\n", | |
"\n", | |
"def cosine_similarity_ngrams(a, b):\n", | |
" vec1 = Counter(a)\n", | |
" vec2 = Counter(b)\n", | |
" \n", | |
" intersection = set(vec1.keys()) & set(vec2.keys())\n", | |
" numerator = sum([vec1[x] * vec2[x] for x in intersection])\n", | |
"\n", | |
" sum1 = sum([vec1[x]**2 for x in vec1.keys()])\n", | |
" sum2 = sum([vec2[x]**2 for x in vec2.keys()])\n", | |
" denominator = math.sqrt(sum1) * math.sqrt(sum2)\n", | |
"\n", | |
" if not denominator:\n", | |
" return 0.0\n", | |
" return float(numerator) / denominator" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Test N-Gram functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:17.240952", | |
"start_time": "2017-03-21T18:07:17.238871" | |
}, | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"paragraph = \"\"\"It was the best of times, it was the worst of times.\n", | |
" It was the age of wisdom? It was the age of foolishness!\n", | |
" I first met Dr. Frankenstein in Munich; his monster was, presumably, at home.\"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:17.417021", | |
"start_time": "2017-03-21T18:07:17.390647" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of N-grams: 34\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[('It', 'was', 'the', 'best'),\n", | |
" ('was', 'the', 'best', 'of'),\n", | |
" ('the', 'best', 'of', 'times'),\n", | |
" ('best', 'of', 'times', 'it'),\n", | |
" ('of', 'times', 'it', 'was'),\n", | |
" ('times', 'it', 'was', 'the'),\n", | |
" ('it', 'was', 'the', 'worst'),\n", | |
" ('was', 'the', 'worst', 'of'),\n", | |
" ('the', 'worst', 'of', 'times'),\n", | |
" ('worst', 'of', 'times', 'It'),\n", | |
" ('of', 'times', 'It', 'was'),\n", | |
" ('times', 'It', 'was', 'the'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'wisdom'),\n", | |
" ('age', 'of', 'wisdom', 'It'),\n", | |
" ('of', 'wisdom', 'It', 'was'),\n", | |
" ('wisdom', 'It', 'was', 'the'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'foolishness'),\n", | |
" ('age', 'of', 'foolishness', 'I'),\n", | |
" ('of', 'foolishness', 'I', 'first'),\n", | |
" ('foolishness', 'I', 'first', 'met'),\n", | |
" ('I', 'first', 'met', 'Dr'),\n", | |
" ('first', 'met', 'Dr', 'Frankenstein'),\n", | |
" ('met', 'Dr', 'Frankenstein', 'in'),\n", | |
" ('Dr', 'Frankenstein', 'in', 'Munich'),\n", | |
" ('Frankenstein', 'in', 'Munich', 'his'),\n", | |
" ('in', 'Munich', 'his', 'monster'),\n", | |
" ('Munich', 'his', 'monster', 'was'),\n", | |
" ('his', 'monster', 'was', 'presumably'),\n", | |
" ('monster', 'was', 'presumably', 'at'),\n", | |
" ('was', 'presumably', 'at', 'home')]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"_ = get_tuples_nosentences(paragraph);print(\"Number of N-grams:\", len(_));_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:17.574897", | |
"start_time": "2017-03-21T18:07:17.564493" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of N-grams: 25\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[('It', 'was', 'the', 'best'),\n", | |
" ('was', 'the', 'best', 'of'),\n", | |
" ('the', 'best', 'of', 'times'),\n", | |
" ('best', 'of', 'times', 'it'),\n", | |
" ('of', 'times', 'it', 'was'),\n", | |
" ('times', 'it', 'was', 'the'),\n", | |
" ('it', 'was', 'the', 'worst'),\n", | |
" ('was', 'the', 'worst', 'of'),\n", | |
" ('the', 'worst', 'of', 'times'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'wisdom'),\n", | |
" ('age', 'of', 'wisdom', 'It'),\n", | |
" ('of', 'wisdom', 'It', 'was'),\n", | |
" ('wisdom', 'It', 'was', 'the'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'foolishness'),\n", | |
" ('I', 'first', 'met', 'Dr'),\n", | |
" ('Frankenstein', 'in', 'Munich', 'his'),\n", | |
" ('in', 'Munich', 'his', 'monster'),\n", | |
" ('Munich', 'his', 'monster', 'was'),\n", | |
" ('his', 'monster', 'was', 'presumably'),\n", | |
" ('monster', 'was', 'presumably', 'at'),\n", | |
" ('was', 'presumably', 'at', 'home')]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"_ = get_tuples_manual_sentences(paragraph);print(\"Number of N-grams:\", len(_));_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:17.918758", | |
"start_time": "2017-03-21T18:07:17.903904" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of N-grams: 25\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[('It', 'was', 'the', 'best'),\n", | |
" ('was', 'the', 'best', 'of'),\n", | |
" ('the', 'best', 'of', 'times'),\n", | |
" ('best', 'of', 'times', 'it'),\n", | |
" ('of', 'times', 'it', 'was'),\n", | |
" ('times', 'it', 'was', 'the'),\n", | |
" ('it', 'was', 'the', 'worst'),\n", | |
" ('was', 'the', 'worst', 'of'),\n", | |
" ('the', 'worst', 'of', 'times'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'wisdom'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'foolishness'),\n", | |
" ('I', 'first', 'met', 'Dr'),\n", | |
" ('first', 'met', 'Dr', 'Frankenstein'),\n", | |
" ('met', 'Dr', 'Frankenstein', 'in'),\n", | |
" ('Dr', 'Frankenstein', 'in', 'Munich'),\n", | |
" ('Frankenstein', 'in', 'Munich', 'his'),\n", | |
" ('in', 'Munich', 'his', 'monster'),\n", | |
" ('Munich', 'his', 'monster', 'was'),\n", | |
" ('his', 'monster', 'was', 'presumably'),\n", | |
" ('monster', 'was', 'presumably', 'at'),\n", | |
" ('was', 'presumably', 'at', 'home')]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"_ = get_tuples_nltk_punkt_sentences(paragraph);print(\"Number of N-grams:\", len(_));_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:18.235779", | |
"start_time": "2017-03-21T18:07:18.220070" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of N-grams: 25\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[('It', 'was', 'the', 'best'),\n", | |
" ('was', 'the', 'best', 'of'),\n", | |
" ('the', 'best', 'of', 'times'),\n", | |
" ('best', 'of', 'times', 'it'),\n", | |
" ('of', 'times', 'it', 'was'),\n", | |
" ('times', 'it', 'was', 'the'),\n", | |
" ('it', 'was', 'the', 'worst'),\n", | |
" ('was', 'the', 'worst', 'of'),\n", | |
" ('the', 'worst', 'of', 'times'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'wisdom'),\n", | |
" ('It', 'was', 'the', 'age'),\n", | |
" ('was', 'the', 'age', 'of'),\n", | |
" ('the', 'age', 'of', 'foolishness'),\n", | |
" ('I', 'first', 'met', 'Dr'),\n", | |
" ('first', 'met', 'Dr', 'Frankenstein'),\n", | |
" ('met', 'Dr', 'Frankenstein', 'in'),\n", | |
" ('Dr', 'Frankenstein', 'in', 'Munich'),\n", | |
" ('Frankenstein', 'in', 'Munich', 'his'),\n", | |
" ('in', 'Munich', 'his', 'monster'),\n", | |
" ('Munich', 'his', 'monster', 'was'),\n", | |
" ('his', 'monster', 'was', 'presumably'),\n", | |
" ('monster', 'was', 'presumably', 'at'),\n", | |
" ('was', 'presumably', 'at', 'home')]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"_ = get_tuples_textblob_sentences(paragraph);print(\"Number of N-grams:\", len(_));_" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Test Similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:18.873284", | |
"start_time": "2017-03-21T18:07:18.865795" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Jaccard: 0.0 Cosine: 0.0\n" | |
] | |
} | |
], | |
"source": [ | |
"a = get_tuples_nosentences(\"It was the best of times.\")\n", | |
"b = get_tuples_nosentences(\"It was the worst of times.\")\n", | |
"print(\"Jaccard: {} Cosine: {}\".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:19.188519", | |
"start_time": "2017-03-21T18:07:19.181992" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Jaccard: 0.2 Cosine: 0.33333333333333337\n" | |
] | |
} | |
], | |
"source": [ | |
"a = get_tuples_nosentences(\"Above is a bad example of four-gram similarity.\")\n", | |
"b = get_tuples_nosentences(\"This is a better example of four-gram similarity.\")\n", | |
"print(\"Jaccard: {} Cosine: {}\".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2017-03-21T18:07:19.499491", | |
"start_time": "2017-03-21T18:07:19.493078" | |
}, | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Jaccard: 0.14285714285714285 Cosine: 0.5714285714285714\n" | |
] | |
} | |
], | |
"source": [ | |
"a = get_tuples_nosentences(\"Jaccard Index ignores repetition repetition repetition repetition repetition.\")\n", | |
"b = get_tuples_nosentences(\"Cosine similarity weighs repetition repetition repetition repetition repetition.\")\n", | |
"print(\"Jaccard: {} Cosine: {}\".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get Tuple algorithms | |
import re | |
import math | |
import numpy as np | |
from itertools import chain | |
from collections import Counter | |
import nltk | |
from nltk.util import ngrams # This is the ngram magic. | |
from textblob import TextBlob | |
NGRAM = 4 | |
re_sent_ends_naive = re.compile(r'[.\n]') | |
re_stripper_alpha = re.compile('[^a-zA-Z]+') | |
re_stripper_naive = re.compile('[^a-zA-Z\.\n]') | |
splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x)) | |
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') | |
def get_tuples_nosentences(txt): | |
"""Get tuples that ignores all punctuation (including sentences).""" | |
if not txt: return None | |
ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM) | |
return list(ng) | |
def get_tuples_manual_sentences(txt): | |
"""Naive get tuples that uses periods or newlines to denote sentences.""" | |
if not txt: return None | |
sentences = (x.split() for x in splitter_naive(txt) if x) | |
ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM) | |
return list(chain(*ng)) | |
def get_tuples_nltk_punkt_sentences(txt): | |
"""Get tuples that doesn't use textblob.""" | |
if not txt: return None | |
sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x) | |
# Need to filter X because of empty 'words' from punctuation split | |
ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM) | |
return list(chain(*ng)) | |
def get_tuples_textblob_sentences(txt): | |
"""New get_tuples that does use textblob.""" | |
if not txt: return None | |
tb = TextBlob(txt) | |
ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) > NGRAM) | |
return [item for sublist in ng for item in sublist] | |
def jaccard_distance(a, b): | |
"""Calculate the jaccard distance between sets A and B""" | |
a = set(a) | |
b = set(b) | |
return 1.0 * len(a&b)/len(a|b) | |
def cosine_similarity_ngrams(a, b): | |
vec1 = Counter(a) | |
vec2 = Counter(b) | |
intersection = set(vec1.keys()) & set(vec2.keys()) | |
numerator = sum([vec1[x] * vec2[x] for x in intersection]) | |
sum1 = sum([vec1[x]**2 for x in vec1.keys()]) | |
sum2 = sum([vec2[x]**2 for x in vec2.keys()]) | |
denominator = math.sqrt(sum1) * math.sqrt(sum2) | |
if not denominator: | |
return 0.0 | |
return float(numerator) / denominator | |
def test(): | |
paragraph = """It was the best of times, it was the worst of times. | |
It was the age of wisdom? It was the age of foolishness! | |
I first met Dr. Frankenstein in Munich; his monster was, presumably, at home.""" | |
print(paragraph) | |
_ = get_tuples_nosentences(paragraph);print("Number of N-grams (no sentences):", len(_));_ | |
_ = get_tuples_manual_sentences(paragraph);print("Number of N-grams (naive sentences):", len(_));_ | |
_ = get_tuples_nltk_punkt_sentences(paragraph);print("Number of N-grams (nltk sentences):", len(_));_ | |
_ = get_tuples_textblob_sentences(paragraph);print("Number of N-grams (TextBlob sentences):", len(_));_ | |
a = get_tuples_nosentences("It was the best of times.") | |
b = get_tuples_nosentences("It was the worst of times.") | |
print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b))) | |
a = get_tuples_nosentences("Above is a bad example of four-gram similarity.") | |
b = get_tuples_nosentences("This is a better example of four-gram similarity.") | |
print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b))) | |
a = get_tuples_nosentences("Jaccard Index ignores repetition repetition repetition repetition repetition.") | |
b = get_tuples_nosentences("Cosine similarity weighs repetition repetition repetition repetition repetition.") | |
print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b))) | |
test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment