@sharanry
Created November 22, 2018 10:09
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder\n",
"from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures\n",
"from nltk.corpus import stopwords\n",
"stop_words = set(stopwords.words('english'))\n",
"import nltk\n",
"import csv\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open('data/travel/quora.txt') as f:\n",
" words_quora = [word for line in f for word in line.split()]\n",
"\n",
"with open('data/travel/wikihow.txt') as f:\n",
" words_wiki = [re.sub(r'[^\\w\\s]', '', word) for line in f for word in line.split()]\n",
"\n",
"with open('data/travel/stackexchange.txt') as f:\n",
" words_stackexchange = [re.sub(r'[^\\w\\s]', '', word) for line in f for word in line.split()]\n",
"\n",
"with open('data/travel/reddit.txt') as f:\n",
" words_reddit = [re.sub(r'[^\\w\\s]', '', word) for line in f for word in line.split()]\n"
]
},
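{
"cell_type": "markdown",
"metadata": {},
"source": [
"A possible consolidation of the four nearly identical loading blocks above into one helper; `load_words` is a hypothetical name, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_words(path):\n",
"    # Tokenize on whitespace and strip punctuation from each token (hypothetical helper).\n",
"    with open(path) as f:\n",
"        return [re.sub(r'[^\\w\\s]', '', word) for line in f for word in line.split()]\n",
"\n",
"# e.g. words_quora = load_words('data/travel/quora.txt')"
]
},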
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"wiki_sentence = []\n",
"for w in words_wiki:\n",
" if w.lower() not in stop_words:\n",
" wiki_sentence.append(w.lower())\n",
"\n",
"quora_sentence = []\n",
"for w in words_quora:\n",
" if w.lower() not in stop_words:\n",
" quora_sentence.append(w.lower())\n",
"\n",
"stackexchange_sentence = []\n",
"for w in words_stackexchange:\n",
" if w.lower() not in stop_words:\n",
" stackexchange_sentence.append(w.lower())\n",
"\n",
"reddit_sentence = []\n",
"for w in words_reddit:\n",
" if w.lower() not in stop_words:\n",
" reddit_sentence.append(w.lower())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(['act', 'white', 'house', 'tour', 'arrange'],\n",
" ['best', 'travel', 'website', 'spain', 'ever'],\n",
" ['traveling', 'us', 'via', 'lhr', 'duplicate'],\n",
" ['data', 'visualization', 'competition', 'rdataisbeautiful', 'rtravel'])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wiki_sentence[:5], quora_sentence[:5],stackexchange_sentence[:5], reddit_sentence[:5]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"finder = TrigramCollocationFinder.from_words(wiki_sentence)\n",
"trigram_wiki = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 20)\n",
"finder = TrigramCollocationFinder.from_words(quora_sentence)\n",
"trigram_quora = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 20)\n",
"\n",
"finder = BigramCollocationFinder.from_words(wiki_sentence)\n",
"bigram_wiki = finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)\n",
"finder = BigramCollocationFinder.from_words(quora_sentence)\n",
"bigram_quora = finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)\n",
"\n",
"finder = BigramCollocationFinder.from_words(reddit_sentence)\n",
"bigram_reddit = finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)\n",
"finder = BigramCollocationFinder.from_words(stackexchange_sentence)\n",
"bigram_stackexchange = finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)\n",
"\n",
"finder = TrigramCollocationFinder.from_words(reddit_sentence)\n",
"trigram_reddit = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 20)\n",
"finder = TrigramCollocationFinder.from_words(stackexchange_sentence)\n",
"trigram_stackexchange = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 20)\n"
]
},
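{
"cell_type": "markdown",
"metadata": {},
"source": [
"The likelihood-ratio ranking can promote rare n-grams that co-occur only once or twice. A minimal sketch using NLTK's built-in frequency filter; the threshold of 3 is an assumption, not a setting from the original run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Drop bigrams seen fewer than 3 times before ranking (threshold is an assumption).\n",
"finder = BigramCollocationFinder.from_words(wiki_sentence)\n",
"finder.apply_freq_filter(3)\n",
"bigram_wiki_filtered = finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)"
]
},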
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([('new', 'york', 'city'),\n",
" ('inexpensively', 'new', 'york'),\n",
" ('wander', 'new', 'york'),\n",
" ('visit', 'new', 'york'),\n",
" ('la', 'new', 'york')],\n",
" [('new', 'york'),\n",
" ('san', 'francisco'),\n",
" ('road', 'trip'),\n",
" ('united', 'states'),\n",
" ('washington', 'dc')])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trigram_wiki[:5], bigram_wiki[:5]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"universal = trigram_wiki + bigram_wiki + trigram_quora + bigram_quora + trigram_stackexchange + bigram_stackexchange + trigram_reddit + bigram_reddit"
]
},
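{
"cell_type": "markdown",
"metadata": {},
"source": [
"The concatenated list can contain duplicates across sources (the cluster listing further down shows, e.g., (road, trip) and (hong, kong) more than once). A sketch of an order-preserving deduplication, if one is wanted:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# dict.fromkeys keeps the first occurrence of each phrase; dicts preserve\n",
"# insertion order in CPython 3.6 (guaranteed by the language from 3.7).\n",
"universal_unique = list(dict.fromkeys(universal))"
]
},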
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"nlp = spacy.load('en_core_web_lg')\n",
"import math\n",
"from subprocess import Popen, PIPE"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"vectors300 = [nlp(' '.join(i)).vector for i in universal]"
]
},
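{
"cell_type": "markdown",
"metadata": {},
"source": [
"Phrases whose tokens are all out of vocabulary embed as the zero vector and would collapse into one spurious cluster. A minimal check using spaCy's `Doc.vector_norm`; this step is not part of the original run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Flag phrases with no usable vector (all tokens out of vocabulary).\n",
"oov_phrases = [p for p in universal if nlp(' '.join(p)).vector_norm == 0]\n",
"print(len(oov_phrases))"
]
},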
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"23\n"
]
}
],
"source": [
"from sklearn.cluster import MeanShift\n",
"import numpy as np\n",
"import pandas as pd\n",
"shift = MeanShift(bandwidth=4, n_jobs=-1)\n",
"shift.fit(vectors300)\n",
"\n",
"meanshift_df = pd.DataFrame({\n",
" \"phrase\": universal,\n",
" \"label\": shift.labels_,\n",
" \"vector\": vectors300\n",
"})\n",
"print(len(shift.cluster_centers_))"
]
},
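{
"cell_type": "markdown",
"metadata": {},
"source": [
"The bandwidth of 4 is hand-picked; scikit-learn can also estimate one from the data. A sketch using `estimate_bandwidth`; the quantile value is an assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import estimate_bandwidth\n",
"\n",
"# Estimate a bandwidth from pairwise point distances; quantile=0.2 is a common value to try.\n",
"bw = estimate_bandwidth(np.array(vectors300), quantile=0.2)\n",
"print(bw)"
]
},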
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No of points: 104\n",
"0 (new, york, city)\n",
"1 (inexpensively, new, york)\n",
"2 (wander, new, york)\n",
"3 (visit, new, york)\n",
"4 (la, new, york)\n",
"5 (bus, new, york)\n",
"6 (miami, new, york)\n",
"7 (experience, new, york)\n",
"8 (new, york, visit)\n",
"9 (new, york, cross)\n",
"10 (new, york, state)\n",
"11 (see, new, york)\n",
"12 (navigate, new, york)\n",
"13 (island, new, york)\n",
"14 (new, york, spend)\n",
"15 (francisco, new, york)\n",
"16 (tour, new, york)\n",
"17 (new, york, ride)\n",
"18 (tourist, new, york)\n",
"19 (trip, new, york)\n",
"20 (new, york)\n",
"22 (road, trip)\n",
"25 (get, married)\n",
"26 (york, city)\n",
"31 (time, trip)\n",
"32 (get, around)\n",
"33 (plan, trip)\n",
"37 (new, zealand)\n",
"39 (three, days)\n",
"40 (invited, someones, home)\n",
" ... \n",
"116 (green, card)\n",
"117 (public, transport)\n",
"118 (6, months)\n",
"120 (first, time, traveller)\n",
"121 (today, first, time)\n",
"122 (first, time, visiting)\n",
"123 (first, time, going)\n",
"124 (dealt, first, time)\n",
"125 (first, time, roadtrips)\n",
"126 (zion, national, park)\n",
"127 (chinese, first, time)\n",
"128 (first, time, dubai)\n",
"129 (start, first, time)\n",
"130 (alone, first, time)\n",
"131 (dui, first, time)\n",
"132 (night, first, time)\n",
"133 (europe, first, time)\n",
"134 (first, time, traveler)\n",
"135 (first, time, norway)\n",
"136 (canyon, first, time)\n",
"137 (first, time, 10)\n",
"138 (first, time, early)\n",
"139 (2019, first, time)\n",
"140 (first, time)\n",
"141 (national, park)\n",
"146 (grand, canyon)\n",
"147 (road, trip)\n",
"148 (new, zealand)\n",
"153 (need, advice)\n",
"159 (credit, card)\n",
"Name: phrase, Length: 104, dtype: object\n",
"No of points: 30\n",
"29 (visa, apply)\n",
"34 (tourist, visa)\n",
"80 (schengen, visa, application)\n",
"81 (schengen, visa, refusal)\n",
"82 (uk, schengen, visa)\n",
"83 (need, schengen, visa)\n",
"84 (apply, schengen, visa)\n",
"85 (applying, schengen, visa)\n",
"86 (schengen, visa, refused)\n",
"87 (schengen, visa, waiver)\n",
"88 (schengen, visa, duplicate)\n",
"89 (multipleentry, schengen, visa)\n",
"90 (schengen, visa, issued)\n",
"91 (schengen, visa, requirements)\n",
"92 (schengen, visa, required)\n",
"93 (schengen, visa, expires)\n",
"94 (singleentry, schengen, visa)\n",
"95 (get, schengen, visa)\n",
"96 (transit, visa, application)\n",
"97 (visa, schengen, visa)\n",
"98 (valid, schengen, visa)\n",
"99 (us, schengen, visa)\n",
"100 (schengen, visa)\n",
"101 (transit, visa)\n",
"102 (visitor, visa)\n",
"104 (visa, application)\n",
"105 (tourist, visa)\n",
"109 (schengen, area)\n",
"110 (uk, visa)\n",
"114 (visa, refusal)\n",
"Name: phrase, dtype: object\n",
"No of points: 3\n",
"28 (hong, kong)\n",
"103 (hong, kong)\n",
"144 (hong, kong)\n",
"Name: phrase, dtype: object\n",
"No of points: 2\n",
"23 (united, states)\n",
"111 (united, states)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"119 (connecting, flight)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"152 (renting, car)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"155 (cesky, krumlov)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"38 (hotel, room)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"30 (international, airport)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"158 (machu, picchu)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"106 (residence, permit)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"157 (isle, skye)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"156 (hidden, gems)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"154 (base, camp)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"151 (late, december)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"35 (oahu, hawaii)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"24 (washington, dc)\n",
"Name: phrase, dtype: object\n",
"No of points: 3\n",
"21 (san, francisco)\n",
"36 (los, angeles)\n",
"150 (san, francisco)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"149 (costa, rica)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"143 (cinque, terre)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"142 (sri, lanka)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"145 (global, entry)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"27 (las, vegas)\n",
"Name: phrase, dtype: object\n"
]
}
],
"source": [
"for i in range(len(shift.cluster_centers_)):\n",
" print(\"No of points: \"+ str(np.sum(meanshift_df[\"label\"]==i)))\n",
" print(meanshift_df[meanshift_df[\"label\"]==i][\"phrase\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (GPUReady)",
"language": "python",
"name": "gpuready"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}