{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import nltk\n",
"from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder\n",
"from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures\n",
"from nltk.corpus import stopwords\n",
"\n",
"stop_words = set(stopwords.words('english'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize each corpus on whitespace and strip punctuation from every token.\n",
"def read_words(path):\n",
"    with open(path) as f:\n",
"        return [re.sub(r'[^\\w\\s]', '', word) for line in f for word in line.split()]\n",
"\n",
"words_quora = read_words('data/travel/quora.txt')\n",
"words_wiki = read_words('data/travel/wikihow.txt')\n",
"words_stackexchange = read_words('data/travel/stackexchange.txt')\n",
"words_reddit = read_words('data/travel/reddit.txt')"
]
},
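{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustrative check of the punctuation-stripping pattern: `[^\\w\\s]` matches any character that is neither a word character nor whitespace, so apostrophes and periods disappear along with the rest of the punctuation. The sample string below is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check on an arbitrary sample string (illustrative only).\n",
"re.sub(r'[^\\w\\s]', '', \"don't miss N.Y.C.!\")  # -> 'dont miss NYC'"
]
},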
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Lowercase every token and drop English stopwords.\n",
"def remove_stopwords(words):\n",
"    return [w.lower() for w in words if w.lower() not in stop_words]\n",
"\n",
"wiki_sentence = remove_stopwords(words_wiki)\n",
"quora_sentence = remove_stopwords(words_quora)\n",
"stackexchange_sentence = remove_stopwords(words_stackexchange)\n",
"reddit_sentence = remove_stopwords(words_reddit)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(['act', 'white', 'house', 'tour', 'arrange'],\n",
" ['best', 'travel', 'website', 'spain', 'ever'],\n",
" ['traveling', 'us', 'via', 'lhr', 'duplicate'],\n",
" ['data', 'visualization', 'competition', 'rdataisbeautiful', 'rtravel'])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wiki_sentence[:5], quora_sentence[:5], stackexchange_sentence[:5], reddit_sentence[:5]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Top-20 bigram and trigram collocations for each source,\n",
"# ranked by the log-likelihood-ratio association measure.\n",
"def top_bigrams(words, n=20):\n",
"    return BigramCollocationFinder.from_words(words).nbest(BigramAssocMeasures.likelihood_ratio, n)\n",
"\n",
"def top_trigrams(words, n=20):\n",
"    return TrigramCollocationFinder.from_words(words).nbest(TrigramAssocMeasures.likelihood_ratio, n)\n",
"\n",
"bigram_wiki = top_bigrams(wiki_sentence)\n",
"trigram_wiki = top_trigrams(wiki_sentence)\n",
"bigram_quora = top_bigrams(quora_sentence)\n",
"trigram_quora = top_trigrams(quora_sentence)\n",
"bigram_stackexchange = top_bigrams(stackexchange_sentence)\n",
"trigram_stackexchange = top_trigrams(stackexchange_sentence)\n",
"bigram_reddit = top_bigrams(reddit_sentence)\n",
"trigram_reddit = top_trigrams(reddit_sentence)"
]
},
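{
"cell_type": "markdown",
"metadata": {},
"source": [
"Phrases that occur only once or twice can still score highly on the likelihood-ratio measure. NLTK's collocation finders support a minimum-frequency filter before ranking; the sketch below is illustrative, and the threshold of 3 is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: drop bigrams seen fewer than 3 times before ranking\n",
"# (the threshold is an arbitrary choice).\n",
"finder = BigramCollocationFinder.from_words(wiki_sentence)\n",
"finder.apply_freq_filter(3)\n",
"finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)"
]
},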
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([('new', 'york', 'city'),\n",
" ('inexpensively', 'new', 'york'),\n",
" ('wander', 'new', 'york'),\n",
" ('visit', 'new', 'york'),\n",
" ('la', 'new', 'york')],\n",
" [('new', 'york'),\n",
" ('san', 'francisco'),\n",
" ('road', 'trip'),\n",
" ('united', 'states'),\n",
" ('washington', 'dc')])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trigram_wiki[:5], bigram_wiki[:5]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"universal = (trigram_wiki + bigram_wiki\n",
"             + trigram_quora + bigram_quora\n",
"             + trigram_stackexchange + bigram_stackexchange\n",
"             + trigram_reddit + bigram_reddit)"
]
},
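{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because the per-source lists are simply concatenated, `universal` can contain the same phrase several times when a collocation ranks highly in more than one source (`('new', 'york')`, for instance). If duplicates are unwanted, an order-preserving deduplication is a one-liner; the variant below is optional and `universal_unique` is an illustrative name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional, order-preserving deduplication (tuples are hashable).\n",
"universal_unique = list(dict.fromkeys(universal))\n",
"len(universal), len(universal_unique)"
]
},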
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load('en_core_web_lg')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Embed each phrase: spaCy's Doc.vector is the average of its token\n",
"# vectors, so each collocation becomes one 300-dimensional vector.\n",
"vectors300 = [nlp(' '.join(i)).vector for i in universal]"
]
},
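{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before clustering, it can be worth eyeballing whether the phrase vectors behave sensibly: semantically related phrases should be closer than unrelated ones. The check below uses spaCy's built-in cosine similarity on a couple of phrases picked arbitrarily from the data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: related phrases should score higher than unrelated ones.\n",
"print(nlp('new york city').similarity(nlp('san francisco')))\n",
"print(nlp('new york city').similarity(nlp('schengen visa')))"
]
},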
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"23\n"
]
}
],
"source": [
"from sklearn.cluster import MeanShift\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# MeanShift chooses the number of clusters itself; the bandwidth\n",
"# controls how coarse the clustering is.\n",
"shift = MeanShift(bandwidth=4, n_jobs=-1)\n",
"shift.fit(vectors300)\n",
"\n",
"meanshift_df = pd.DataFrame({\n",
"    \"phrase\": universal,\n",
"    \"label\": shift.labels_,\n",
"    \"vector\": vectors300\n",
"})\n",
"print(len(shift.cluster_centers_))"
]
},
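{
"cell_type": "markdown",
"metadata": {},
"source": [
"The bandwidth of 4 above is hand-tuned. scikit-learn can also estimate a bandwidth from pairwise distances in the data, which may be a useful starting point when re-tuning; in the sketch below the `quantile` value is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: estimate a bandwidth from the data instead of hand-tuning\n",
"# (quantile=0.2 is an arbitrary choice).\n",
"from sklearn.cluster import estimate_bandwidth\n",
"estimate_bandwidth(np.array(vectors300), quantile=0.2)"
]
},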
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No of points: 104\n",
"0 (new, york, city)\n",
"1 (inexpensively, new, york)\n",
"2 (wander, new, york)\n",
"3 (visit, new, york)\n",
"4 (la, new, york)\n",
"5 (bus, new, york)\n",
"6 (miami, new, york)\n",
"7 (experience, new, york)\n",
"8 (new, york, visit)\n",
"9 (new, york, cross)\n",
"10 (new, york, state)\n",
"11 (see, new, york)\n",
"12 (navigate, new, york)\n",
"13 (island, new, york)\n",
"14 (new, york, spend)\n",
"15 (francisco, new, york)\n",
"16 (tour, new, york)\n",
"17 (new, york, ride)\n",
"18 (tourist, new, york)\n",
"19 (trip, new, york)\n",
"20 (new, york)\n",
"22 (road, trip)\n",
"25 (get, married)\n",
"26 (york, city)\n",
"31 (time, trip)\n",
"32 (get, around)\n",
"33 (plan, trip)\n",
"37 (new, zealand)\n",
"39 (three, days)\n",
"40 (invited, someones, home)\n",
" ... \n",
"116 (green, card)\n",
"117 (public, transport)\n",
"118 (6, months)\n",
"120 (first, time, traveller)\n",
"121 (today, first, time)\n",
"122 (first, time, visiting)\n",
"123 (first, time, going)\n",
"124 (dealt, first, time)\n",
"125 (first, time, roadtrips)\n",
"126 (zion, national, park)\n",
"127 (chinese, first, time)\n",
"128 (first, time, dubai)\n",
"129 (start, first, time)\n",
"130 (alone, first, time)\n",
"131 (dui, first, time)\n",
"132 (night, first, time)\n",
"133 (europe, first, time)\n",
"134 (first, time, traveler)\n",
"135 (first, time, norway)\n",
"136 (canyon, first, time)\n",
"137 (first, time, 10)\n",
"138 (first, time, early)\n",
"139 (2019, first, time)\n",
"140 (first, time)\n",
"141 (national, park)\n",
"146 (grand, canyon)\n",
"147 (road, trip)\n",
"148 (new, zealand)\n",
"153 (need, advice)\n",
"159 (credit, card)\n",
"Name: phrase, Length: 104, dtype: object\n",
"No of points: 30\n",
"29 (visa, apply)\n",
"34 (tourist, visa)\n",
"80 (schengen, visa, application)\n",
"81 (schengen, visa, refusal)\n",
"82 (uk, schengen, visa)\n",
"83 (need, schengen, visa)\n",
"84 (apply, schengen, visa)\n",
"85 (applying, schengen, visa)\n",
"86 (schengen, visa, refused)\n",
"87 (schengen, visa, waiver)\n",
"88 (schengen, visa, duplicate)\n",
"89 (multipleentry, schengen, visa)\n",
"90 (schengen, visa, issued)\n",
"91 (schengen, visa, requirements)\n",
"92 (schengen, visa, required)\n",
"93 (schengen, visa, expires)\n",
"94 (singleentry, schengen, visa)\n",
"95 (get, schengen, visa)\n",
"96 (transit, visa, application)\n",
"97 (visa, schengen, visa)\n",
"98 (valid, schengen, visa)\n",
"99 (us, schengen, visa)\n",
"100 (schengen, visa)\n",
"101 (transit, visa)\n",
"102 (visitor, visa)\n",
"104 (visa, application)\n",
"105 (tourist, visa)\n",
"109 (schengen, area)\n",
"110 (uk, visa)\n",
"114 (visa, refusal)\n",
"Name: phrase, dtype: object\n",
"No of points: 3\n",
"28 (hong, kong)\n",
"103 (hong, kong)\n",
"144 (hong, kong)\n",
"Name: phrase, dtype: object\n",
"No of points: 2\n",
"23 (united, states)\n",
"111 (united, states)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"119 (connecting, flight)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"152 (renting, car)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"155 (cesky, krumlov)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"38 (hotel, room)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"30 (international, airport)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"158 (machu, picchu)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"106 (residence, permit)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"157 (isle, skye)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"156 (hidden, gems)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"154 (base, camp)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"151 (late, december)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"35 (oahu, hawaii)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"24 (washington, dc)\n",
"Name: phrase, dtype: object\n",
"No of points: 3\n",
"21 (san, francisco)\n",
"36 (los, angeles)\n",
"150 (san, francisco)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"149 (costa, rica)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"143 (cinque, terre)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"142 (sri, lanka)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"145 (global, entry)\n",
"Name: phrase, dtype: object\n",
"No of points: 1\n",
"27 (las, vegas)\n",
"Name: phrase, dtype: object\n"
]
}
],
"source": [
"for i in range(len(shift.cluster_centers_)):\n",
"    print(\"No of points: \" + str(np.sum(meanshift_df[\"label\"] == i)))\n",
"    print(meanshift_df[meanshift_df[\"label\"] == i][\"phrase\"])"
]
},
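{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the cluster assignments need to be reused outside the notebook, the phrase and label columns can be written to disk; a minimal sketch, where `clusters.csv` is a placeholder filename."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: persist phrase -> cluster assignments\n",
"# ('clusters.csv' is a placeholder filename).\n",
"meanshift_df[[\"phrase\", \"label\"]].to_csv('clusters.csv', index=False)"
]
}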
],
"metadata": {
"kernelspec": {
"display_name": "Python (GPUReady)",
"language": "python",
"name": "gpuready"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}