{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import nltk\n",
"from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder\n",
"from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures\n",
"from nltk.corpus import stopwords\n",
"\n",
"stop_words = set(stopwords.words('english'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize each corpus on whitespace and strip punctuation from every token.\n",
"def read_words(path):\n",
"    with open(path) as f:\n",
"        return [re.sub(r'[^\\w\\s]', '', word) for line in f for word in line.split()]\n",
"\n",
"words_quora = read_words('data/travel/quora.txt')\n",
"words_wiki = read_words('data/travel/wikihow.txt')\n",
"words_stackexchange = read_words('data/travel/stackexchange.txt')\n",
"words_reddit = read_words('data/travel/reddit.txt')"
]
},
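{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustrative check of the punctuation-stripping pattern: `[^\\w\\s]` matches any character that is neither a word character nor whitespace, so apostrophes and periods disappear along with the rest of the punctuation. The sample string below is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check on an arbitrary sample string (illustrative only).\n",
"re.sub(r'[^\\w\\s]', '', \"don't miss N.Y.C.!\")  # -> 'dont miss NYC'"
]
},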
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Lowercase every token and drop English stopwords.\n",
"def remove_stopwords(words):\n",
"    return [w.lower() for w in words if w.lower() not in stop_words]\n",
"\n",
"wiki_sentence = remove_stopwords(words_wiki)\n",
"quora_sentence = remove_stopwords(words_quora)\n",
"stackexchange_sentence = remove_stopwords(words_stackexchange)\n",
"reddit_sentence = remove_stopwords(words_reddit)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(['act', 'white', 'house', 'tour', 'arrange'],\n",
" ['best', 'travel', 'website', 'spain', 'ever'],\n",
" ['traveling', 'us', 'via', 'lhr', 'duplicate'],\n",
" ['data', 'visualization', 'competition', 'rdataisbeautiful', 'rtravel'])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wiki_sentence[:5], quora_sentence[:5], stackexchange_sentence[:5], reddit_sentence[:5]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Top-20 bigram and trigram collocations for each source,\n",
"# ranked by the log-likelihood-ratio association measure.\n",
"def top_bigrams(words, n=20):\n",
"    return BigramCollocationFinder.from_words(words).nbest(BigramAssocMeasures.likelihood_ratio, n)\n",
"\n",
"def top_trigrams(words, n=20):\n",
"    return TrigramCollocationFinder.from_words(words).nbest(TrigramAssocMeasures.likelihood_ratio, n)\n",
"\n",
"bigram_wiki = top_bigrams(wiki_sentence)\n",
"trigram_wiki = top_trigrams(wiki_sentence)\n",
"bigram_quora = top_bigrams(quora_sentence)\n",
"trigram_quora = top_trigrams(quora_sentence)\n",
"bigram_stackexchange = top_bigrams(stackexchange_sentence)\n",
"trigram_stackexchange = top_trigrams(stackexchange_sentence)\n",
"bigram_reddit = top_bigrams(reddit_sentence)\n",
"trigram_reddit = top_trigrams(reddit_sentence)"
]
},
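{
"cell_type": "markdown",
"metadata": {},
"source": [
"Phrases that occur only once or twice can still score highly on the likelihood-ratio measure. NLTK's collocation finders support a minimum-frequency filter before ranking; the sketch below is illustrative, and the threshold of 3 is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: drop bigrams seen fewer than 3 times before ranking\n",
"# (the threshold is an arbitrary choice).\n",
"finder = BigramCollocationFinder.from_words(wiki_sentence)\n",
"finder.apply_freq_filter(3)\n",
"finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)"
]
},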
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([('new', 'york', 'city'),\n",
" ('inexpensively', 'new', 'york'),\n",
" ('wander', 'new', 'york'),\n",
" ('visit', 'new', 'york'),\n",
" ('la', 'new', 'york')],\n",
" [('new', 'york'),\n",
" ('san', 'francisco'),\n",
" ('road', 'trip'),\n",
" ('united', 'states'),\n",
" ('washington', 'dc')])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trigram_wiki[:5], bigram_wiki[:5]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"universal = (trigram_wiki + bigram_wiki\n",
"             + trigram_quora + bigram_quora\n",
"             + trigram_stackexchange + bigram_stackexchange\n",
"             + trigram_reddit + bigram_reddit)"
]
},
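{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because the per-source lists are simply concatenated, `universal` can contain the same phrase several times when a collocation ranks highly in more than one source (`('new', 'york')`, for instance). If duplicates are unwanted, an order-preserving deduplication is a one-liner; the variant below is optional and `universal_unique` is an illustrative name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional, order-preserving deduplication (tuples are hashable).\n",
"universal_unique = list(dict.fromkeys(universal))\n",
"len(universal), len(universal_unique)"
]
},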
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load('en_core_web_lg')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Embed each phrase: spaCy's Doc.vector is the average of its token\n",
"# vectors, so each collocation becomes one 300-dimensional vector.\n",
"vectors300 = [nlp(' '.join(i)).vector for i in universal]"
]
},
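{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before clustering, it can be worth eyeballing whether the phrase vectors behave sensibly: semantically related phrases should be closer than unrelated ones. The check below uses spaCy's built-in cosine similarity on a couple of phrases picked arbitrarily from the data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: related phrases should score higher than unrelated ones.\n",
"print(nlp('new york city').similarity(nlp('san francisco')))\n",
"print(nlp('new york city').similarity(nlp('schengen visa')))"
]
},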
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"23\n"
]
}
],
"source": [
"from sklearn.cluster import MeanShift\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# MeanShift chooses the number of clusters itself; the bandwidth\n",
"# controls how coarse the clustering is.\n",
"shift = MeanShift(bandwidth=4, n_jobs=-1)\n",
"shift.fit(vectors300)\n",
"\n",
"meanshift_df = pd.DataFrame({\n",
"    \"phrase\": universal,\n",
"    \"label\": shift.labels_,\n",
"    \"vector\": vectors300\n",
"})\n",
"print(len(shift.cluster_centers_))"
]
},
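{
"cell_type": "markdown",
"metadata": {},
"source": [
"The bandwidth of 4 above is hand-tuned. scikit-learn can also estimate a bandwidth from pairwise distances in the data, which may be a useful starting point when re-tuning; in the sketch below the `quantile` value is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: estimate a bandwidth from the data instead of hand-tuning\n",
"# (quantile=0.2 is an arbitrary choice).\n",
"from sklearn.cluster import estimate_bandwidth\n",
"estimate_bandwidth(np.array(vectors300), quantile=0.2)"
]
},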
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No of points: 104\n",
"0 (new, york, city)\n",
"1 (inexpensively, new, york)\n",
"2 (wander, new, york)\n",
"3 (visit, new, york)\n",
"4 (la, new, york)\n",
"5 (bus, new, york)\n",
"6 (miami, new, york)\n",
"7 (experience, new, york)\n",
"8 (new, york, visit)\n",
"9 (new, york, cross)\n",
"10 (new, york, state)\n",
"11 (see, new, york)\n",
"12 (navigate, new, york)\n",
"13 (island, new, york)\n",
"14 (new, york, spend)\n",
"15 (francisco, new, york)\n",
"16 (tour, new, york)\n",
"17 (new, york, ride)\n",
"18 (tourist, new, york)\n",
"19 (trip, new, york)\n",
"20 (new, york)\n",
"22 (road, trip)\n",
"25 (get, married)\n",
"26 (york, city)\n",
"31 (time, trip)\n",
"32 (get, around)\n",
"33 (plan, trip)\n",
"37 (new, zealand)\n",
"39 (three, days)\n",
"40 (invited, someones, home)\n",
" ... \n",
"116 (green, card)\n",
"117 (public, transport)\n",
"118 (6, months)\n",
"120 (first, time, traveller)\n",
"121 (today, first, time)\n",
"122 (first, time, visiting)\n",
"123 (first, time, going)\n",
"124 (dealt, first, time)\n",
"125 (first, time, roadtrips)\n",
"126 (zion, national, park)\n",
"127 (chinese, first, time)\n",
"128 (first, time, dubai)\n",
"129 (start, first, time)\n",
"130 (alone, first, time)\n",
"131 (dui, first, time)\n",
"132 (night, first, time)\n",
"133 (europe, first, time)\n",
"134 (first, time, traveler)\n",
"135 (first, time, norway)\n",
"136 (canyon, first, time)\n",
"137 (first, time, 10)\n",
"138 (first, time, early)\n",
"139 (2019, first, time)\n",
"140 (first, time)\n",
"141 (national, park)\n",
"146 (grand, canyon)\n",
"147 (road, trip)\n",
"148 (new, zealand)\n",
"153 (need, advice)\n",
"159 (credit, card)\n",
"Name: phrase, Length: 104, dtype: object\n",
"No of points: 30\n",
"29 (visa, apply)\n",
"34 (tourist, visa)\n",
"80 (schengen, visa, application)\n",
"81 (schengen, visa, refusal)\n",
"82 (uk, schengen, visa)\n",
"83 (need, schengen, visa)\n",
"84 (apply, schengen, visa)\n",
"85 (applying, schengen, visa)\n",
"86 (schengen, visa, refused)\n",
"87 (schengen, visa, waiver)\n",
"88 (schengen, visa, duplicate)\n",
"89 (multipleentry, schengen, visa)\n",
"90 (schengen, visa, issued)\n",
"91 (schengen, visa, requirements)\n",
"92 (schengen, visa, required)\n",
"93 (schengen, visa, expires)\n",
"94 (singleentry, schengen, visa)\n",
"95 (get, schengen, visa)\n",
"96 (transit, visa, application)\n",
"97 (visa, schengen, visa)\n",
"98 (valid, schengen, visa)\n",
"99 (us, schengen, visa)\n",
"100 (schengen, visa)\n",
"101 (transit, visa)\n",
"102 (visitor, visa)\n",
"104 (visa, application)\n",
"105 (tourist, visa)\n",
"109 (schengen, area)\n",
"110 (uk, visa)\n",
"114 (visa, refusal)\n",
"Name: phrase, dtype: object\n",
"No of points: 3\n",
"28 (hong, kong)\n",
"103 (hong, kong)\n",
"144 (hong, kong)\n",
"Name: phrase, dtype: object\n",
"No of points: 2\n",
"23 (united, states)\n",
"111 (united, states)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"119 (connecting, flight)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"152 (renting, car)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"155 (cesky, krumlov)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"38 (hotel, room)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"30 (international, airport)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"158 (machu, picchu)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"106 (residence, permit)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"157 (isle, skye)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"156 (hidden, gems)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"154 (base, camp)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"151 (late, december)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"35 (oahu, hawaii)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"24 (washington, dc)\n",
"Name: phrase, dtype: object\n",
"No of points: 3\n",
"21 (san, francisco)\n",
"36 (los, angeles)\n",
"150 (san, francisco)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"149 (costa, rica)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"143 (cinque, terre)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"142 (sri, lanka)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"145 (global, entry)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"27 (las, vegas)\n",
"Name: phrase, dtype: object\n"
]
}
],
"source": [
"for i in range(len(shift.cluster_centers_)):\n",
"    print(\"No of points: \" + str(np.sum(meanshift_df[\"label\"] == i)))\n",
"    print(meanshift_df[meanshift_df[\"label\"] == i][\"phrase\"])"
]
},
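{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the cluster assignments need to be reused outside the notebook, the phrase and label columns can be written to disk; a minimal sketch, where `clusters.csv` is a placeholder filename."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: persist phrase -> cluster assignments\n",
"# ('clusters.csv' is a placeholder filename).\n",
"meanshift_df[[\"phrase\", \"label\"]].to_csv('clusters.csv', index=False)"
]
}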
],
"metadata": {
"kernelspec": {
"display_name": "Python (GPUReady)",
"language": "python",
"name": "gpuready"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}