Instantly share code, notes, and snippets.
Created
January 18, 2017 10:03
-
Star
2
(2)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save mizvol/eb24770ac3d5d598463f972e2a669f03 to your computer and use it in GitHub Desktop.
LDA topic analysis of Instagram hashtags for clustering. Analysis + Visualization in D3JS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Clustering Instagram users using hashtags. Topic analysis and visualization in D3JS" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pymongo as pm\n", | |
| "import unicodedata" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Reading the data from Mongo" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "client = pm.MongoClient()\n", | |
| "db = client.instagram\n", | |
| "tagsDB = db.tags" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Extracting tags data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "rawTags = []\n", | |
| "for user in tagsDB.find():\n", | |
| " rawTags.extend(user['tags'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "424113" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(rawTags)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'contiki',\n", | |
| " u'swissalps',\n", | |
| " u'newfriends',\n", | |
| " u'freezingmynutsoff',\n", | |
| " u'walkabout',\n", | |
| " u'jungfraujoch',\n", | |
| " u'yolo',\n", | |
| " u'travel',\n", | |
| " u'noregrets',\n", | |
| " u'goodtimes']" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "rawTags[:10]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "tagsRDD = sc.parallelize(rawTags)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "424113" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tagsRDD.count()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Cleaning\n", | |
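| "Note: the cleaning below strips accents (NFKD normalization) and drops every non-ASCII character, so tags written entirely in non-Latin scripts collapse to an empty string. If you want to keep language-specific features and words, you have to clean the data in a different way." | |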
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "countsRDD = (\n", | |
| " tagsRDD\n", | |
| " .map(lambda tag: (unicodedata.normalize('NFKD', tag).encode('ascii','ignore'), 1))\n", | |
| " .reduceByKey(lambda a, b: a + b)\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "106083" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "countsRDD.count()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Explore the data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "ordered = countsRDD.takeOrdered(500, lambda (key, value): -value)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('', 9146),\n", | |
| " ('switzerland', 8973),\n", | |
| " ('zurich', 3990),\n", | |
| " ('love', 2605),\n", | |
| " ('swiss', 2593),\n", | |
| " ('easter', 2526),\n", | |
| " ('mountains', 1999),\n", | |
| " ('travel', 1978),\n", | |
| " ('spring', 1963),\n", | |
| " ('snow', 1809),\n", | |
| " ('sun', 1754),\n", | |
| " ('nature', 1742),\n", | |
| " ('lake', 1565),\n", | |
| " ('geneva', 1516),\n", | |
| " ('beautiful', 1485),\n", | |
| " ('schweiz', 1462),\n", | |
| " ('happy', 1408),\n", | |
| " ('instagood', 1403),\n", | |
| " ('photooftheday', 1265),\n", | |
| " ('picoftheday', 1256),\n", | |
| " ('suisse', 1238),\n", | |
| " ('friends', 1199),\n", | |
| " ('alps', 1165),\n", | |
| " ('happyeaster', 1096),\n", | |
| " ('ski', 1085),\n", | |
| " ('fun', 1044),\n", | |
| " ('basel', 994),\n", | |
| " ('landscape', 928),\n", | |
| " ('sky', 923),\n", | |
| " ('skiing', 860),\n", | |
| " ('sunset', 839),\n", | |
| " ('swissalps', 820),\n", | |
| " ('family', 805),\n", | |
| " ('bern', 800),\n", | |
| " ('nofilter', 799),\n", | |
| " ('luzern', 770),\n", | |
| " ('amazing', 755),\n", | |
| " ('view', 754),\n", | |
| " ('europe', 737),\n", | |
| " ('instadaily', 736),\n", | |
| " ('weekend', 713),\n", | |
| " ('geneve', 701),\n", | |
| " ('fashion', 700),\n", | |
| " ('art', 680),\n", | |
| " ('holiday', 678),\n", | |
| " ('sunnyday', 666),\n", | |
| " ('me', 659),\n", | |
| " ('mountain', 657),\n", | |
| " ('food', 648),\n", | |
| " ('lausanne', 589),\n", | |
| " ('instalike', 583),\n", | |
| " ('smile', 577),\n", | |
| " ('style', 572),\n", | |
| " ('like4like', 565),\n", | |
| " ('lucerne', 565),\n", | |
| " ('followme', 563),\n", | |
| " ('clouds', 561),\n", | |
| " ('architecture', 546),\n", | |
| " ('blue', 544),\n", | |
| " ('wanderlust', 540),\n", | |
| " ('zermatt', 537),\n", | |
| " ('instatravel', 536),\n", | |
| " ('selfie', 535),\n", | |
| " ('instamood', 533),\n", | |
| " ('life', 532),\n", | |
| " ('winter', 529),\n", | |
| " ('city', 519),\n", | |
| " ('ostern', 515),\n", | |
| " ('trip', 515),\n", | |
| " ('sunny', 513),\n", | |
| " ('photography', 512),\n", | |
| " ('flowers', 501),\n", | |
| " ('blackandwhite', 497),\n", | |
| " ('baselworld2016', 496),\n", | |
| " ('travelgram', 492),\n", | |
| " ('bluesky', 492),\n", | |
| " ('sunshine', 491),\n", | |
| " ('instagram', 485),\n", | |
| " ('girl', 478),\n", | |
| " ('foodporn', 476),\n", | |
| " ('home', 471),\n", | |
| " ('verbier', 471),\n", | |
| " ('party', 462),\n", | |
| " ('igers', 462),\n", | |
| " ('music', 458),\n", | |
| " ('vscocam', 451),\n", | |
| " ('beauty', 443),\n", | |
| " ('tbt', 436),\n", | |
| " ('montreux', 435),\n", | |
| " ('vsco', 429),\n", | |
| " ('luxury', 426),\n", | |
| " ('instapic', 424),\n", | |
| " ('baselworld', 422),\n", | |
| " ('fitness', 422),\n", | |
| " ('follow', 422),\n", | |
| " ('svizzera', 420),\n", | |
| " ('chocolate', 418),\n", | |
| " ('visitswitzerland', 407),\n", | |
| " ('lifestyle', 397),\n", | |
| " ('snowboarding', 396),\n", | |
| " ('night', 395),\n", | |
| " ('relax', 385),\n", | |
| " ('matterhorn', 380),\n", | |
| " ('lacleman', 378),\n", | |
| " ('photo', 372),\n", | |
| " ('water', 370),\n", | |
| " ('lugano', 365),\n", | |
| " ('holidays', 362),\n", | |
| " ('interlaken', 357),\n", | |
| " ('myswitzerland', 356),\n", | |
| " ('vacation', 354),\n", | |
| " ('design', 349),\n", | |
| " ('switzerlandwonderland', 348),\n", | |
| " ('summer', 348),\n", | |
| " ('goodtimes', 346),\n", | |
| " ('stmoritz', 342),\n", | |
| " ('morning', 341),\n", | |
| " ('day', 340),\n", | |
| " ('cute', 340),\n", | |
| " ('2016', 333),\n", | |
| " ('traveling', 332),\n", | |
| " ('enjoy', 328),\n", | |
| " ('tagsforlikes', 326),\n", | |
| " ('sunday', 322),\n", | |
| " ('tree', 322),\n", | |
| " ('saturday', 320),\n", | |
| " ('green', 319),\n", | |
| " ('bestoftheday', 314),\n", | |
| " ('goodmorning', 314),\n", | |
| " ('loveit', 309),\n", | |
| " ('travelling', 309),\n", | |
| " ('instafood', 297),\n", | |
| " ('river', 295),\n", | |
| " ('happiness', 294),\n", | |
| " ('white', 294),\n", | |
| " ('hiking', 292),\n", | |
| " ('nice', 291),\n", | |
| " ('germany', 287),\n", | |
| " ('snowboard', 285),\n", | |
| " ('coffee', 281),\n", | |
| " ('france', 281),\n", | |
| " ('konstanz', 279),\n", | |
| " ('inlovewithswitzerland', 276),\n", | |
| " ('black', 276),\n", | |
| " ('ticino', 274),\n", | |
| " ('follow4follow', 271),\n", | |
| " ('valais', 269),\n", | |
| " ('healthy', 265),\n", | |
| " ('instacool', 264),\n", | |
| " ('work', 264),\n", | |
| " ('adventure', 263),\n", | |
| " ('watch', 257),\n", | |
| " ('sport', 257),\n", | |
| " ('trees', 256),\n", | |
| " ('likeforlike', 256),\n", | |
| " ('zuri', 255),\n", | |
| " ('bodensee', 255),\n", | |
| " ('awesome', 254),\n", | |
| " ('watches', 253),\n", | |
| " ('_', 251),\n", | |
| " ('springtime', 250),\n", | |
| " ('light', 246),\n", | |
| " ('italy', 246),\n", | |
| " ('paques', 246),\n", | |
| " ('fruhling', 244),\n", | |
| " ('yummy', 243),\n", | |
| " ('street', 243),\n", | |
| " ('breakfast', 242),\n", | |
| " ('graubunden', 242),\n", | |
| " ('train', 241),\n", | |
| " ('naturelovers', 241),\n", | |
| " ('dinner', 240),\n", | |
| " ('explore', 240),\n", | |
| " ('davos', 239),\n", | |
| " ('best', 236),\n", | |
| " ('swissmade', 236),\n", | |
| " ('girls', 234),\n", | |
| " ('red', 234),\n", | |
| " ('peace', 230),\n", | |
| " ('laax', 229),\n", | |
| " ('travelingram', 229),\n", | |
| " ('sunrise', 227),\n", | |
| " ('chill', 226),\n", | |
| " ('like', 224),\n", | |
| " ('workout', 223),\n", | |
| " ('panorama', 223),\n", | |
| " ('switzerlandpictures', 221),\n", | |
| " ('cool', 221),\n", | |
| " ('gopro', 219),\n", | |
| " ('ootd', 217),\n", | |
| " ('delicious', 216),\n", | |
| " ('beautifulday', 215),\n", | |
| " ('zurichsee', 215),\n", | |
| " ('sweet', 215),\n", | |
| " ('model', 214),\n", | |
| " ('throwback', 213),\n", | |
| " ('ig_switzerland', 211),\n", | |
| " ('photographer', 210),\n", | |
| " ('car', 210),\n", | |
| " ('dog', 209),\n", | |
| " ('suiza', 205),\n", | |
| " ('beautifuldestinations', 205),\n", | |
| " ('see', 205),\n", | |
| " ('colorful', 205),\n", | |
| " ('walk', 204),\n", | |
| " ('colors', 204),\n", | |
| " ('lunch', 204),\n", | |
| " ('new', 202),\n", | |
| " ('training', 202),\n", | |
| " ('live', 201),\n", | |
| " ('gym', 201),\n", | |
| " ('foodie', 200),\n", | |
| " ('forest', 200),\n", | |
| " ('motivation', 198),\n", | |
| " ('cold', 197),\n", | |
| " ('world', 195),\n", | |
| " ('beer', 194),\n", | |
| " ('ischgl', 194),\n", | |
| " ('familytime', 192),\n", | |
| " ('castle', 192),\n", | |
| " ('pasqua', 190),\n", | |
| " ('running', 190),\n", | |
| " ('fit', 188),\n", | |
| " ('switzerland_vacations', 187),\n", | |
| " ('restaurant', 187),\n", | |
| " ('good', 187),\n", | |
| " ('pink', 186),\n", | |
| " ('bunny', 186),\n", | |
| " ('roadtrip', 186),\n", | |
| " ('homesweethome', 186),\n", | |
| " ('time', 185),\n", | |
| " ('my', 184),\n", | |
| " ('l4l', 184),\n", | |
| " ('picture', 183),\n", | |
| " ('memories', 183),\n", | |
| " ('lakegeneva', 181),\n", | |
| " ('nike', 181),\n", | |
| " ('alpes', 180),\n", | |
| " ('inspiration', 180),\n", | |
| " ('nikon', 179),\n", | |
| " ('tb', 178),\n", | |
| " ('instalove', 177),\n", | |
| " ('sonne', 176),\n", | |
| " ('grindelwald', 173),\n", | |
| " ('church', 171),\n", | |
| " ('canon', 171),\n", | |
| " ('weather', 171),\n", | |
| " ('travelphotography', 168),\n", | |
| " ('engelberg', 168),\n", | |
| " ('repost', 167),\n", | |
| " ('concert', 166),\n", | |
| " ('goodlife', 166),\n", | |
| " ('tattoo', 165),\n", | |
| " ('neverstopexploring', 164),\n", | |
| " ('engadin', 164),\n", | |
| " ('bridge', 164),\n", | |
| " ('with', 163),\n", | |
| " ('pretty', 163),\n", | |
| " ('iloveswitzerland', 163),\n", | |
| " ('instaphoto', 162),\n", | |
| " ('lovely', 162),\n", | |
| " ('watchporn', 162),\n", | |
| " ('jungfrau', 161),\n", | |
| " ('passion', 161),\n", | |
| " ('wallis', 160),\n", | |
| " ('airport', 159),\n", | |
| " ('perfect', 158),\n", | |
| " ('hotel', 158),\n", | |
| " ('tourism', 157),\n", | |
| " ('shopping', 156),\n", | |
| " ('friendship', 156),\n", | |
| " ('funny', 155),\n", | |
| " ('monday', 154),\n", | |
| " ('easterweekend', 153),\n", | |
| " ('swag', 152),\n", | |
| " ('instamoment', 152),\n", | |
| " ('flower', 152),\n", | |
| " ('berge', 150),\n", | |
| " ('the', 150),\n", | |
| " ('froheostern', 150),\n", | |
| " ('thun', 150),\n", | |
| " ('iphoneonly', 149),\n", | |
| " ('mylove', 147),\n", | |
| " ('pic', 147),\n", | |
| " ('skyporn', 147),\n", | |
| " ('bar', 147),\n", | |
| " ('brunch', 146),\n", | |
| " ('neuchatel', 145),\n", | |
| " ('loveswitzerlandcontest', 145),\n", | |
| " ('powder', 144),\n", | |
| " ('fresh', 143),\n", | |
| " ('evening', 143),\n", | |
| " ('makeup', 143),\n", | |
| " ('boy', 142),\n", | |
| " ('hair', 142),\n", | |
| " ('vegan', 142),\n", | |
| " ('hot', 142),\n", | |
| " ('wonderful', 141),\n", | |
| " ('in', 141),\n", | |
| " ('color', 140),\n", | |
| " ('house', 140),\n", | |
| " ('tourist', 140),\n", | |
| " ('instafollow', 140),\n", | |
| " ('eurotrip', 139),\n", | |
| " ('swizerland', 139),\n", | |
| " ('top', 139),\n", | |
| " ('friday', 139),\n", | |
| " ('lago', 139),\n", | |
| " ('traveller', 138),\n", | |
| " ('suica', 138),\n", | |
| " ('pictureoftheday', 138),\n", | |
| " ('instago', 138),\n", | |
| " ('eggs', 138),\n", | |
| " ('gold', 138),\n", | |
| " ('potd', 138),\n", | |
| " ('rhein', 138),\n", | |
| " ('polymanga', 137),\n", | |
| " ('wine', 137),\n", | |
| " ('stgallen', 136),\n", | |
| " ('rolex', 136),\n", | |
| " ('mood', 136),\n", | |
| " ('austria', 136),\n", | |
| " ('dance', 136),\n", | |
| " ('swan', 136),\n", | |
| " ('fribourg', 135),\n", | |
| " ('goodday', 134),\n", | |
| " ('alpen', 134),\n", | |
| " ('igdaily', 134),\n", | |
| " ('printemps', 134),\n", | |
| " ('swisslife', 134),\n", | |
| " ('lamborghini', 134),\n", | |
| " ('paris', 133),\n", | |
| " ('apresski', 133),\n", | |
| " ('march', 133),\n", | |
| " ('boat', 132),\n", | |
| " ('schnee', 132),\n", | |
| " ('portrait', 132),\n", | |
| " ('oldtown', 132),\n", | |
| " ('crazy', 132),\n", | |
| " ('lac', 132),\n", | |
| " ('birthday', 132),\n", | |
| " ('urban', 131),\n", | |
| " ('f4f', 131),\n", | |
| " ('arosa', 130),\n", | |
| " ('tflers', 130),\n", | |
| " ('latergram', 130),\n", | |
| " ('swissmountains', 129),\n", | |
| " ('winterwonderland', 128),\n", | |
| " ('vaud', 128),\n", | |
| " ('jetdeau', 128),\n", | |
| " ('streetart', 128),\n", | |
| " ('cat', 128),\n", | |
| " ('bmw', 127),\n", | |
| " ('ig_europe', 126),\n", | |
| " ('titlis', 126),\n", | |
| " ('look', 125),\n", | |
| " ('fitfam', 125),\n", | |
| " ('enjoylife', 125),\n", | |
| " ('and', 124),\n", | |
| " ('traveltheworld', 124),\n", | |
| " ('blessed', 124),\n", | |
| " ('paradise', 124),\n", | |
| " ('montagne', 123),\n", | |
| " ('outdoors', 123),\n", | |
| " ('ig_swiss', 122),\n", | |
| " ('vevey', 122),\n", | |
| " ('dessert', 122),\n", | |
| " ('couple', 121),\n", | |
| " ('sunglasses', 121),\n", | |
| " ('bike', 121),\n", | |
| " ('zug', 120),\n", | |
| " ('winterthur', 120),\n", | |
| " ('hiphop', 120),\n", | |
| " ('cars', 119),\n", | |
| " ('baby', 119),\n", | |
| " ('club', 118),\n", | |
| " ('animal', 118),\n", | |
| " ('ferrari', 117),\n", | |
| " ('vintage', 117),\n", | |
| " ('natur', 116),\n", | |
| " ('friend', 116),\n", | |
| " ('museum', 116),\n", | |
| " ('qualitytime', 116),\n", | |
| " ('carporn', 116),\n", | |
| " ('goodvibes', 116),\n", | |
| " ('loveyou', 115),\n", | |
| " ('wood', 115),\n", | |
| " ('igtravel', 115),\n", | |
| " ('lindt', 115),\n", | |
| " ('instagramers', 115),\n", | |
| " ('italia', 114),\n", | |
| " ('goodtime', 114),\n", | |
| " ('buonapasqua', 114),\n", | |
| " ('nature_perfection', 114),\n", | |
| " ('fly', 113),\n", | |
| " ('nofilterneeded', 113),\n", | |
| " ('today', 113),\n", | |
| " ('audi', 113),\n", | |
| " ('bw', 112),\n", | |
| " ('eat', 112),\n", | |
| " ('shooting', 112),\n", | |
| " ('watchesofinstagram', 111),\n", | |
| " ('walking', 111),\n", | |
| " ('supercar', 110),\n", | |
| " ('igerssuisse', 110),\n", | |
| " ('lakezurich', 110),\n", | |
| " ('garden', 110),\n", | |
| " ('likes', 110),\n", | |
| " ('great', 110),\n", | |
| " ('dj', 109),\n", | |
| " ('traveler', 109),\n", | |
| " ('super_switzerland', 109),\n", | |
| " ('yellow', 109),\n", | |
| " ('artist', 109),\n", | |
| " ('porsche', 109),\n", | |
| " ('landscape_lovers', 108),\n", | |
| " ('drinks', 108),\n", | |
| " ('happyday', 108),\n", | |
| " ('handmade', 108),\n", | |
| " ('run', 108),\n", | |
| " ('naturephotography', 108),\n", | |
| " ('goodnight', 107),\n", | |
| " ('vierwaldstattersee', 107),\n", | |
| " ('people', 106),\n", | |
| " ('blonde', 106),\n", | |
| " ('visitzurich', 106),\n", | |
| " ('london', 105),\n", | |
| " ('cheese', 105),\n", | |
| " ('easterbunny', 105),\n", | |
| " ('outdoor', 105),\n", | |
| " ('fondue', 105),\n", | |
| " ('ascona', 104),\n", | |
| " ('followforfollow', 104),\n", | |
| " ('watchoftheday', 104),\n", | |
| " ('leman', 103),\n", | |
| " ('lagomaggiore', 102),\n", | |
| " ('streetphotography', 102),\n", | |
| " ('reflection', 102),\n", | |
| " ('lights', 101),\n", | |
| " ('building', 101),\n", | |
| " ('ice', 101),\n", | |
| " ('iphone', 100),\n", | |
| " ('genevalake', 100),\n", | |
| " ('health', 100),\n", | |
| " ('freeride', 100),\n", | |
| " ('bodybuilding', 99),\n", | |
| " ('igersgeneva', 99),\n", | |
| " ('champagne', 99),\n", | |
| " ('waterfall', 99),\n", | |
| " ('beard', 99),\n", | |
| " ('chilling', 98),\n", | |
| " ('cloudporn', 98),\n", | |
| " ('sister', 98),\n", | |
| " ('primavera', 97),\n", | |
| " ('dream', 97),\n", | |
| " ('starbucks', 97),\n", | |
| " ('instafashion', 97),\n", | |
| " ('aviation', 97),\n", | |
| " ('springbreak', 97),\n", | |
| " ('rheinfall', 97),\n", | |
| " ('dogsofinstagram', 97),\n", | |
| " ('polymanga2016', 96),\n", | |
| " ('liechtenstein', 96),\n", | |
| " ('zurichcity', 96),\n", | |
| " ('igersswitzerland', 95),\n", | |
| " ('blogger', 95),\n", | |
| " ('instanature', 95),\n", | |
| " ('scenery', 95),\n", | |
| " ('schaffhausen', 95),\n", | |
| " ('outfit', 95),\n", | |
| " ('horology', 94),\n", | |
| " ('liveauthentic', 94),\n", | |
| " ('shoes', 94),\n", | |
| " ('nightlife', 94),\n", | |
| " ('animals', 94),\n", | |
| " ('adidas', 93),\n", | |
| " ('interiordesign', 93),\n", | |
| " ('instatraveling', 93),\n", | |
| " ('jewelry', 92),\n", | |
| " ('homemade', 92),\n", | |
| " ('cake', 92),\n", | |
| " ('tasty', 92),\n", | |
| " ('nightout', 92),\n", | |
| " ('wow', 91),\n", | |
| " ('zurichairport', 91),\n", | |
| " ('gstaad', 91),\n", | |
| " ('mylife', 91),\n", | |
| " ('rigi', 91),\n", | |
| " ('video', 91),\n", | |
| " ('mercedes', 91),\n", | |
| " ('a', 91),\n", | |
| " ('all_shots', 90),\n", | |
| " ('flying', 90),\n", | |
| " ('moment', 90),\n", | |
| " ('deutschland', 89),\n", | |
| " ('zurisee', 89),\n", | |
| " ('mytravelgram', 89),\n", | |
| " ('forever', 89),\n", | |
| " ('beach', 89),\n", | |
| " ('park', 88)]" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ordered" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " , switzerland , zurich , love , swiss , easter , mountains , travel , spring , snow , sun , nature , lake , geneva , beautiful , schweiz , happy , instagood , photooftheday , picoftheday , suisse , friends , alps , happyeaster , ski , fun , basel , landscape , sky , skiing , sunset , swissalps , family , bern , nofilter , luzern , amazing , view , europe , instadaily , weekend , geneve , fashion , art , holiday , sunnyday , me , mountain , food , lausanne , instalike , smile , style , like4like , lucerne , followme , clouds , architecture , blue , wanderlust , zermatt , instatravel , selfie , instamood , life , winter , city , ostern , trip , sunny , photography , flowers , blackandwhite , baselworld2016 , travelgram , bluesky , sunshine , instagram , girl , foodporn , home , verbier , party , igers , music , vscocam , beauty , tbt , montreux , vsco , luxury , instapic , baselworld , fitness , follow , svizzera , chocolate , visitswitzerland , lifestyle , snowboarding , night , relax , matterhorn , lacleman , photo , water , lugano , holidays , interlaken , myswitzerland , vacation , design , switzerlandwonderland , summer , goodtimes , stmoritz , morning , day , cute , 2016 , traveling , enjoy , tagsforlikes , sunday , tree , saturday , green , bestoftheday , goodmorning , loveit , travelling , instafood , river , happiness , white , hiking , nice , germany , snowboard , coffee , france , konstanz , inlovewithswitzerland , black , ticino , follow4follow , valais , healthy , instacool , work , adventure , watch , sport , trees , likeforlike , zuri , bodensee , awesome , watches , _ , springtime , light , italy , paques , fruhling , yummy , street , breakfast , graubunden , train , naturelovers , dinner , explore , davos , best , swissmade , girls , red , peace , laax , travelingram , sunrise , chill , like , workout , panorama , switzerlandpictures , cool , gopro , ootd , delicious , beautifulday , zurichsee , sweet , model , throwback , ig_switzerland , 
photographer , car , dog , suiza , beautifuldestinations , see , colorful , walk , colors , lunch , new , training , live , gym , foodie , forest , motivation , cold , world , beer , ischgl , familytime , castle , pasqua , running , fit , switzerland_vacations , restaurant , good , pink , bunny , roadtrip , homesweethome , time , my , l4l , picture , memories , lakegeneva , nike , alpes , inspiration , nikon , tb , instalove , sonne , grindelwald , church , canon , weather , travelphotography , engelberg , repost , concert , goodlife , tattoo , neverstopexploring , engadin , bridge , with , pretty , iloveswitzerland , instaphoto , lovely , watchporn , jungfrau , passion , wallis , airport , perfect , hotel , tourism , shopping , friendship , funny , monday , easterweekend , swag , instamoment , flower , berge , the , froheostern , thun , iphoneonly , mylove , pic , skyporn , bar , brunch , neuchatel , loveswitzerlandcontest , powder , fresh , evening , makeup , boy , hair , vegan , hot , wonderful , in , color , house , tourist , instafollow , eurotrip , swizerland , top , friday , lago , traveller , suica , pictureoftheday , instago , eggs , gold , potd , rhein , polymanga , wine , stgallen , rolex , mood , austria , dance , swan , fribourg , goodday , alpen , igdaily , printemps , swisslife , lamborghini , paris , apresski , march , boat , schnee , portrait , oldtown , crazy , lac , birthday , urban , f4f , arosa , tflers , latergram , swissmountains , winterwonderland , vaud , jetdeau , streetart , cat , bmw , ig_europe , titlis , look , fitfam , enjoylife , and , traveltheworld , blessed , paradise , montagne , outdoors , ig_swiss , vevey , dessert , couple , sunglasses , bike , zug , winterthur , hiphop , cars , baby , club , animal , ferrari , vintage , natur , friend , museum , qualitytime , carporn , goodvibes , loveyou , wood , igtravel , lindt , instagramers , italia , goodtime , buonapasqua , nature_perfection , fly , nofilterneeded , today , audi , bw , 
eat , shooting , watchesofinstagram , walking , supercar , igerssuisse , lakezurich , garden , likes , great , dj , traveler , super_switzerland , yellow , artist , porsche , landscape_lovers , drinks , happyday , handmade , run , naturephotography , goodnight , vierwaldstattersee , people , blonde , visitzurich , london , cheese , easterbunny , outdoor , fondue , ascona , followforfollow , watchoftheday , leman , lagomaggiore , streetphotography , reflection , lights , building , ice , iphone , genevalake , health , freeride , bodybuilding , igersgeneva , champagne , waterfall , beard , chilling , cloudporn , sister , primavera , dream , starbucks , instafashion , aviation , springbreak , rheinfall , dogsofinstagram , polymanga2016 , liechtenstein , zurichcity , igersswitzerland , blogger , instanature , scenery , schaffhausen , outfit , horology , liveauthentic , shoes , nightlife , animals , adidas , interiordesign , instatraveling , jewelry , homemade , cake , tasty , nightout , wow , zurichairport , gstaad , mylife , rigi , video , mercedes , a , all_shots , flying , moment , deutschland , zurisee , mytravelgram , forever , beach , park ,\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for order in ordered:\n", | |
| " print order[0],\",\"," | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "with open('/home/volodymyrmiz/Desktop/rawTags.txt', 'w') as f:\n", | |
| " for tag in ordered:\n", | |
| " if tag[0] != '':\n", | |
| " f.write((tag[0] + ' ')*(tag[1] / 10))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from matplotlib import pyplot as plt\n", | |
| "import matplotlib\n", | |
| "matplotlib.style.use('ggplot')\n", | |
| "import numpy as np" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "frequentTags = [tag[0] for tag in ordered]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "frequency = [tag[1] for tag in ordered]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "y_pos = np.arange(len(frequentTags))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#plt.barh(y_pos, frequency, alpha=0.5)\n", | |
| "#plt.yticks(y_pos, frequentTags)\n", | |
| "#plt.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Find word co-occurrences" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "userTags = []\n", | |
| "for user in tagsDB.find():\n", | |
| " userTags.append([unicodedata.normalize('NFKD', tag).encode('ascii','ignore') \n", | |
| " for tag in user['tags'] \n", | |
| " if unicodedata.normalize('NFKD', tag).encode('ascii','ignore') != ''])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['contiki',\n", | |
| " 'swissalps',\n", | |
| " 'newfriends',\n", | |
| " 'freezingmynutsoff',\n", | |
| " 'walkabout',\n", | |
| " 'jungfraujoch',\n", | |
| " 'yolo',\n", | |
| " 'travel',\n", | |
| " 'noregrets']" | |
| ] | |
| }, | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "userTags[0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Co-occurrence for train:\n", | |
| "train\n", | |
| "switzerland\n", | |
| "travel\n", | |
| "zurich\n", | |
| "mountains\n", | |
| "swiss\n", | |
| "easter\n", | |
| "snow\n", | |
| "nature\n", | |
| "lake\n", | |
| "photooftheday\n", | |
| "alps\n", | |
| "sky\n", | |
| "beautiful\n", | |
| "europe\n", | |
| "spring\n", | |
| "sbb\n", | |
| "view\n", | |
| "clouds\n", | |
| "love\n", | |
| "instagood\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from collections import Counter\n", | |
| "search_word = \"train\"\n", | |
| "count_search = Counter()\n", | |
| "for tag in userTags:\n", | |
| " if search_word in tag:\n", | |
| " count_search.update(tag)\n", | |
| "print(\"Co-occurrence for %s:\" % search_word)\n", | |
| "for word in count_search.most_common(21):\n", | |
| " print word[0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Topic analysis using LDA" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "**Latent Dirichlet Allocation (LDA).**\n", | |
| "Many classical\n", | |
| "clustering models restrict a document to being associated with a single topic. LDA,\n", | |
| "on the other hand, involves three levels, and notably the topic node is sampled repeatedly within the\n", | |
| "document. Under this model, documents can be associated with multiple topics. Here, a document is the list of hashtags used by one user." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from pyspark.mllib.clustering import LDA, LDAModel\n", | |
| "from pyspark.mllib.linalg import Vectors" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "tagsList = []\n", | |
| "for tag in tagsDB.find():\n", | |
| " tagsList.append((str(tag['_id']), [unicodedata.normalize('NFKD', t).encode('ascii','ignore') \n", | |
| " for t in tag['tags']\n", | |
| " if unicodedata.normalize('NFKD', t).encode('ascii','ignore') != '']))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Filter the tag list of each user. Keep only the most frequent tags, dropping rarely used ones (narrow the `frequentTags[:]` slice to also remove the most common ones)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "filteredList = []\n", | |
| "for tag in tagsList:\n", | |
| " filteredList.append((tag[0], list(set(tag[1]).intersection(frequentTags[:]))))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "tagsListDF = sc.parallelize(filteredList).toDF([\"id\", \"tokens\"])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Vectorize tags arrays for each user" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from pyspark.ml.feature import CountVectorizer" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "vectorizer = CountVectorizer(inputCol=\"tokens\", outputCol=\"features\").fit(tagsListDF)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "countVectors = vectorizer.transform(tagsListDF).select(\"id\", \"features\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[Row(id=u'234933728', features=SparseVector(499, {6: 1.0, 30: 1.0}))]" | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "countVectors.take(1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Find TF-IDF coefficients for each word instead of bag of words" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from pyspark.mllib.feature import IDF" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "frequencyVectors = countVectors.map(lambda vector: vector[1])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[SparseVector(499, {6: 1.0, 30: 1.0}), SparseVector(499, {113: 1.0, 210: 1.0})]" | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "frequencyVectors.take(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "frequencyVectors.cache()\n", | |
| "idf = IDF().fit(frequencyVectors)\n", | |
| "tfidf = idf.transform(frequencyVectors)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[SparseVector(499, {6: 2.8768, 30: 3.7561})]" | |
| ] | |
| }, | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tfidf.take(1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 37, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#just in case, if ids are needed\n", | |
| "tfidf_with_ids = countVectors.map(lambda vector: int(vector[0])).zip(tfidf).map(lambda pair: [pair[0], pair[1]])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[[234933728, SparseVector(499, {6: 2.8768, 30: 3.7561})]]" | |
| ] | |
| }, | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tfidf_with_ids.take(1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "corpus = tfidf.map(lambda x: [1, x]).cache()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 40, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[[1, SparseVector(499, {6: 2.8768, 30: 3.7561})],\n", | |
| " [1, SparseVector(499, {113: 4.6173, 210: 5.1634})],\n", | |
| " [1, SparseVector(499, {})],\n", | |
| " [1, SparseVector(499, {22: 3.4672})],\n", | |
| " [1,\n", | |
| " SparseVector(499, {2: 2.6026, 8: 2.9656, 13: 3.1635, 16: 3.2196, 17: 3.3231, 18: 3.3302, 20: 3.3766, 23: 3.4764, 35: 3.8386, 43: 3.946, 45: 3.9744, 63: 4.1881, 76: 4.2805, 85: 4.3708, 89: 4.4099, 109: 4.5946, 127: 4.7141, 327: 5.5614, 401: 5.7571})],\n", | |
| " [1, SparseVector(499, {418: 5.7753})],\n", | |
| " [1, SparseVector(499, {})],\n", | |
| " [1, SparseVector(499, {2: 2.6026, 24: 3.5149, 114: 4.6289, 176: 5.0243})],\n", | |
| " [1, SparseVector(499, {158: 4.9412})],\n", | |
| " [1, SparseVector(499, {})]]" | |
| ] | |
| }, | |
| "execution_count": 40, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "corpus.take(10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Build Latent Dirichlet Allocation model for clustering" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "ldaModel = LDA.train(corpus, k = 15, maxIterations=100, optimizer=\"online\", docConcentration=2.0, topicConcentration=3.0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Note: LDA does not perform well with the EMLDAOptimizer, which is used by default: with it, the topics are significantly biased towards the most popular hashtags. I used the OnlineLDAOptimizer instead. This optimizer implements the online variational Bayes LDA algorithm, which processes a subset of the corpus on each iteration and updates the term-topic distribution adaptively." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "499" | |
| ] | |
| }, | |
| "execution_count": 43, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(ldaModel.topicsMatrix())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "([8, 23, 5, 64, 28],\n", | |
| " [0.06482984277484072,\n", | |
| " 0.045950294558274096,\n", | |
| " 0.039156100706073844,\n", | |
| " 0.031679928472898536,\n", | |
| " 0.030337389898223453])" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "topicIndices[0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 46, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "vocablist = vectorizer.vocabulary" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<bound method LDAModel.vocabSize of <pyspark.mllib.clustering.LDAModel object at 0x7f959069d210>>" | |
| ] | |
| }, | |
| "execution_count": 47, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ldaModel.vocabSize" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# from operator import itemgetter \n", | |
| "# for topic in topicIndices:\n", | |
| "# text = itemgetter(*topic[0])(vocablist)\n", | |
| "# print \"TOPIC\"\n", | |
| "# for tag in text:\n", | |
| "# print tag, topic[1][text.index(tag)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Visualization" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "topicsRDD = sc.parallelize(topicIndices)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import operator\n", | |
| "termsRDD = topicsRDD.map(lambda topic: (zip(operator.itemgetter(*topic[0])(vocablist), topic[1])))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[[(u'snow', 0.06482984277484072),\n", | |
| " (u'ski', 0.045950294558274096),\n", | |
| " (u'mountains', 0.039156100706073844),\n", | |
| " (u'winter', 0.031679928472898536),\n", | |
| " (u'skiing', 0.030337389898223453)],\n", | |
| " [(u'nature', 0.030827512537037274),\n", | |
| " (u'lake', 0.02478979431897514),\n", | |
| " (u'spring', 0.024462219076508293),\n", | |
| " (u'landscape', 0.0224125857946134),\n", | |
| " (u'flowers', 0.02067768513339612)],\n", | |
| " [(u'luzern', 0.036862489880800375),\n", | |
| " (u'switzerland', 0.0338325739317431),\n", | |
| " (u'verbier', 0.02953757773230965),\n", | |
| " (u'zurich', 0.02188718968235265),\n", | |
| " (u'swiss', 0.02123679980747416)],\n", | |
| " [(u'art', 0.03698756198224295),\n", | |
| " (u'zurich', 0.03313016298308508),\n", | |
| " (u'switzerland', 0.03157849816347215),\n", | |
| " (u'easter', 0.025158555831459168),\n", | |
| " (u'family', 0.0223036711631496)],\n", | |
| " [(u'travel', 0.06901255967841895),\n", | |
| " (u'instatravel', 0.037299454043090645),\n", | |
| " (u'europe', 0.034330202644613936),\n", | |
| " (u'travelgram', 0.03368921018190022),\n", | |
| " (u'trip', 0.03304379062370829)],\n", | |
| " [(u'switzerland', 0.034456317666373186),\n", | |
| " (u'goodtimes', 0.03152461878027823),\n", | |
| " (u'zurich', 0.02950826562264097),\n", | |
| " (u'weekend', 0.022231218091606136),\n", | |
| " (u'tb', 0.018888255116828026)],\n", | |
| " [(u'geneva', 0.05857098842284779),\n", | |
| " (u'car', 0.031332107870771786),\n", | |
| " (u'switzerland', 0.0276575550779648),\n", | |
| " (u'lamborghini', 0.02190996776182064),\n", | |
| " (u'ferrari', 0.02052769480440934)],\n", | |
| " [(u'visitswitzerland', 0.03794345847628957),\n", | |
| " (u'vscocam', 0.034333155243237684),\n", | |
| " (u'switzerlandwonderland', 0.03226477690795611),\n", | |
| " (u'vsco', 0.03213684652910808),\n", | |
| " (u'myswitzerland', 0.028912085943440736)],\n", | |
| " [(u'fitness', 0.050251802705119475),\n", | |
| " (u'healthy', 0.02958433988432552),\n", | |
| " (u'sport', 0.029353669207932892),\n", | |
| " (u'workout', 0.028909345997833683),\n", | |
| " (u'motivation', 0.028554499225599026)],\n", | |
| " [(u'suisse', 0.04139109858365735),\n", | |
| " (u'montreux', 0.03704523460681258),\n", | |
| " (u'lacleman', 0.03250671538638928),\n", | |
| " (u'lausanne', 0.02801144795578456),\n", | |
| " (u'switzerland', 0.02688108605034323)],\n", | |
| " [(u'instagood', 0.03645624632489141),\n", | |
| " (u'picoftheday', 0.03076251964509694),\n", | |
| " (u'photooftheday', 0.029703059421505945),\n", | |
| " (u'instadaily', 0.028632883460075267),\n", | |
| " (u'instalike', 0.026769381245962745)],\n", | |
| " [(u'music', 0.040575034449173424),\n", | |
| " (u'party', 0.03921196577945221),\n", | |
| " (u'friends', 0.027131120772706),\n", | |
| " (u'konstanz', 0.024898402612021024),\n", | |
| " (u'bodensee', 0.02318591615707108)],\n", | |
| " [(u'baselworld2016', 0.056018990170552174),\n", | |
| " (u'baselworld', 0.05048503702936801),\n", | |
| " (u'basel', 0.04590056891456354),\n", | |
| " (u'luxury', 0.041513618850625066),\n", | |
| " (u'watches', 0.03597274460743156)],\n", | |
| " [(u'swiss', 0.02934004343949627),\n", | |
| " (u'nofilter', 0.02346935547247407),\n", | |
| " (u'switzerland', 0.021960428794446603),\n", | |
| " (u'selfie', 0.020904185010864756),\n", | |
| " (u'love', 0.019373152419980853)],\n", | |
| " [(u'food', 0.030915742768915375),\n", | |
| " (u'foodporn', 0.03030777683238159),\n", | |
| " (u'day', 0.02158525753702872),\n", | |
| " (u'instafood', 0.020540403766244577),\n", | |
| " (u'yummy', 0.019046245223605044)]]" | |
| ] | |
| }, | |
| "execution_count": 51, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "termsRDD.take(25)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 52, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "indexedTermsRDD = termsRDD.zipWithIndex()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 53, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "termsRDD = indexedTermsRDD.flatMap(lambda term: [(t[0], t[1], term[1]) for t in term[0]])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 54, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "termDF = termsRDD.toDF(['term', 'probability', 'topicId'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 55, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[Row(term=u'snow', probability=0.06482984277484072, topicId=0),\n", | |
| " Row(term=u'ski', probability=0.045950294558274096, topicId=0),\n", | |
| " Row(term=u'mountains', probability=0.039156100706073844, topicId=0),\n", | |
| " Row(term=u'winter', probability=0.031679928472898536, topicId=0),\n", | |
| " Row(term=u'skiing', probability=0.030337389898223453, topicId=0),\n", | |
| " Row(term=u'nature', probability=0.030827512537037274, topicId=1),\n", | |
| " Row(term=u'lake', probability=0.02478979431897514, topicId=1),\n", | |
| " Row(term=u'spring', probability=0.024462219076508293, topicId=1),\n", | |
| " Row(term=u'landscape', probability=0.0224125857946134, topicId=1),\n", | |
| " Row(term=u'flowers', probability=0.02067768513339612, topicId=1)]" | |
| ] | |
| }, | |
| "execution_count": 55, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "termDF.take(10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "rawJson = termDF.toJSON().collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from IPython.core.display import display, HTML\n", | |
| "from IPython.display import Javascript\n", | |
| "\n", | |
| "s = \"\"\n", | |
| "for line in rawJson:\n", | |
| " s += (str(line) +',')\n", | |
| "stringJson = s[:-1]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 58, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'{\"term\":\"snow\",\"probability\":0.06482984277484072,\"topicId\":0},{\"term\":\"ski\",\"probability\":0.045950294558274096,\"topicId\":0},{\"term\":\"mountains\",\"probability\":0.039156100706073844,\"topicId\":0},{\"term\":\"winter\",\"probability\":0.031679928472898536,\"topicId\":0},{\"term\":\"skiing\",\"probability\":0.030337389898223453,\"topicId\":0},{\"term\":\"nature\",\"probability\":0.030827512537037274,\"topicId\":1},{\"term\":\"lake\",\"probability\":0.02478979431897514,\"topicId\":1},{\"term\":\"spring\",\"probability\":0.024462219076508293,\"topicId\":1},{\"term\":\"landscape\",\"probability\":0.0224125857946134,\"topicId\":1},{\"term\":\"flowers\",\"probability\":0.02067768513339612,\"topicId\":1},{\"term\":\"luzern\",\"probability\":0.036862489880800375,\"topicId\":2},{\"term\":\"switzerland\",\"probability\":0.0338325739317431,\"topicId\":2},{\"term\":\"verbier\",\"probability\":0.02953757773230965,\"topicId\":2},{\"term\":\"zurich\",\"probability\":0.02188718968235265,\"topicId\":2},{\"term\":\"swiss\",\"probability\":0.02123679980747416,\"topicId\":2},{\"term\":\"art\",\"probability\":0.03698756198224295,\"topicId\":3},{\"term\":\"zurich\",\"probability\":0.03313016298308508,\"topicId\":3},{\"term\":\"switzerland\",\"probability\":0.03157849816347215,\"topicId\":3},{\"term\":\"easter\",\"probability\":0.025158555831459168,\"topicId\":3},{\"term\":\"family\",\"probability\":0.0223036711631496,\"topicId\":3},{\"term\":\"travel\",\"probability\":0.06901255967841895,\"topicId\":4},{\"term\":\"instatravel\",\"probability\":0.037299454043090645,\"topicId\":4},{\"term\":\"europe\",\"probability\":0.034330202644613936,\"topicId\":4},{\"term\":\"travelgram\",\"probability\":0.03368921018190022,\"topicId\":4},{\"term\":\"trip\",\"probability\":0.03304379062370829,\"topicId\":4},{\"term\":\"switzerland\",\"probability\":0.034456317666373186,\"topicId\":5},{\"term\":\"goodtimes\",\"probability\":0.03152461878027823,\"topicId\":5},{\"term\":\"zurich\",\
"probability\":0.02950826562264097,\"topicId\":5},{\"term\":\"weekend\",\"probability\":0.022231218091606136,\"topicId\":5},{\"term\":\"tb\",\"probability\":0.018888255116828026,\"topicId\":5},{\"term\":\"geneva\",\"probability\":0.05857098842284779,\"topicId\":6},{\"term\":\"car\",\"probability\":0.031332107870771786,\"topicId\":6},{\"term\":\"switzerland\",\"probability\":0.0276575550779648,\"topicId\":6},{\"term\":\"lamborghini\",\"probability\":0.02190996776182064,\"topicId\":6},{\"term\":\"ferrari\",\"probability\":0.02052769480440934,\"topicId\":6},{\"term\":\"visitswitzerland\",\"probability\":0.03794345847628957,\"topicId\":7},{\"term\":\"vscocam\",\"probability\":0.034333155243237684,\"topicId\":7},{\"term\":\"switzerlandwonderland\",\"probability\":0.03226477690795611,\"topicId\":7},{\"term\":\"vsco\",\"probability\":0.03213684652910808,\"topicId\":7},{\"term\":\"myswitzerland\",\"probability\":0.028912085943440736,\"topicId\":7},{\"term\":\"fitness\",\"probability\":0.050251802705119475,\"topicId\":8},{\"term\":\"healthy\",\"probability\":0.02958433988432552,\"topicId\":8},{\"term\":\"sport\",\"probability\":0.029353669207932892,\"topicId\":8},{\"term\":\"workout\",\"probability\":0.028909345997833683,\"topicId\":8},{\"term\":\"motivation\",\"probability\":0.028554499225599026,\"topicId\":8},{\"term\":\"suisse\",\"probability\":0.04139109858365735,\"topicId\":9},{\"term\":\"montreux\",\"probability\":0.03704523460681258,\"topicId\":9},{\"term\":\"lacleman\",\"probability\":0.03250671538638928,\"topicId\":9},{\"term\":\"lausanne\",\"probability\":0.02801144795578456,\"topicId\":9},{\"term\":\"switzerland\",\"probability\":0.02688108605034323,\"topicId\":9},{\"term\":\"instagood\",\"probability\":0.03645624632489141,\"topicId\":10},{\"term\":\"picoftheday\",\"probability\":0.03076251964509694,\"topicId\":10},{\"term\":\"photooftheday\",\"probability\":0.029703059421505945,\"topicId\":10},{\"term\":\"instadaily\",\"probability\":0.028632883460075267,\"topicI
d\":10},{\"term\":\"instalike\",\"probability\":0.026769381245962745,\"topicId\":10},{\"term\":\"music\",\"probability\":0.040575034449173424,\"topicId\":11},{\"term\":\"party\",\"probability\":0.03921196577945221,\"topicId\":11},{\"term\":\"friends\",\"probability\":0.027131120772706,\"topicId\":11},{\"term\":\"konstanz\",\"probability\":0.024898402612021024,\"topicId\":11},{\"term\":\"bodensee\",\"probability\":0.02318591615707108,\"topicId\":11},{\"term\":\"baselworld2016\",\"probability\":0.056018990170552174,\"topicId\":12},{\"term\":\"baselworld\",\"probability\":0.05048503702936801,\"topicId\":12},{\"term\":\"basel\",\"probability\":0.04590056891456354,\"topicId\":12},{\"term\":\"luxury\",\"probability\":0.041513618850625066,\"topicId\":12},{\"term\":\"watches\",\"probability\":0.03597274460743156,\"topicId\":12},{\"term\":\"swiss\",\"probability\":0.02934004343949627,\"topicId\":13},{\"term\":\"nofilter\",\"probability\":0.02346935547247407,\"topicId\":13},{\"term\":\"switzerland\",\"probability\":0.021960428794446603,\"topicId\":13},{\"term\":\"selfie\",\"probability\":0.020904185010864756,\"topicId\":13},{\"term\":\"love\",\"probability\":0.019373152419980853,\"topicId\":13},{\"term\":\"food\",\"probability\":0.030915742768915375,\"topicId\":14},{\"term\":\"foodporn\",\"probability\":0.03030777683238159,\"topicId\":14},{\"term\":\"day\",\"probability\":0.02158525753702872,\"topicId\":14},{\"term\":\"instafood\",\"probability\":0.020540403766244577,\"topicId\":14},{\"term\":\"yummy\",\"probability\":0.019046245223605044,\"topicId\":14}'" | |
| ] | |
| }, | |
| "execution_count": 58, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "stringJson" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 59, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "html_code = \"\"\"\n", | |
| "<!DOCTYPE html>\n", | |
| "<meta charset=\"utf-8\">\n", | |
| "<style>\n", | |
| "\n", | |
| "circle {\n", | |
| " fill: rgb(31, 119, 180);\n", | |
| " fill-opacity: 0.5;\n", | |
| " stroke: rgb(31, 119, 180);\n", | |
| " stroke-width: 1px;\n", | |
| "}\n", | |
| "\n", | |
| ".leaf circle {\n", | |
| " fill: #ff7f0e;\n", | |
| " fill-opacity: 1;\n", | |
| "}\n", | |
| "\n", | |
| "text {\n", | |
| " font: 14px sans-serif;\n", | |
| "}\n", | |
| "\n", | |
| "</style>\n", | |
| "<body>\n", | |
| "<script src=\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js\"></script>\n", | |
| "\n", | |
| "<script>\n", | |
| "\n", | |
| "var json = {\n", | |
| " \"name\": \"data\",\n", | |
| " \"children\": [\n", | |
| " {\n", | |
| " \"name\": \"topics\",\n", | |
| " \"children\": [\n", | |
| " %s\n", | |
| " ]\n", | |
| " }\n", | |
| " ]\n", | |
| "};\n", | |
| "\n", | |
| "var r = 1500,\n", | |
| " format = d3.format(\",d\"),\n", | |
| " fill = d3.scale.category20c();\n", | |
| "\n", | |
| "var bubble = d3.layout.pack()\n", | |
| " .sort(null)\n", | |
| " .size([r, r])\n", | |
| " .padding(1.5);\n", | |
| "\n", | |
| "var vis = d3.select(\"body\").append(\"svg\")\n", | |
| " .attr(\"width\", r)\n", | |
| " .attr(\"height\", r)\n", | |
| " .attr(\"class\", \"bubble\");\n", | |
| "\n", | |
| " \n", | |
| "var node = vis.selectAll(\"g.node\")\n", | |
| " .data(bubble.nodes(classes(json))\n", | |
| " .filter(function(d) { return !d.children; }))\n", | |
| " .enter().append(\"g\")\n", | |
| " .attr(\"class\", \"node\")\n", | |
| " .attr(\"transform\", function(d) { return \"translate(\" + d.x + \",\" + d.y + \")\"; })\n", | |
| " color = d3.scale.category20();\n", | |
| " \n", | |
| " node.append(\"title\")\n", | |
| " .text(function(d) { return d.className + \": \" + format(d.value); });\n", | |
| "\n", | |
| " node.append(\"circle\")\n", | |
| " .attr(\"r\", function(d) { return d.r; })\n", | |
| " .style(\"fill\", function(d) {return color(d.topicName);});\n", | |
| "\n", | |
| "var text = node.append(\"text\")\n", | |
| " .attr(\"text-anchor\", \"middle\")\n", | |
| " .attr(\"dy\", \".3em\")\n", | |
| " .text(function(d) { return d.className.substring(0, d.r / 3)});\n", | |
| " \n", | |
| " text.append(\"tspan\")\n", | |
| " .attr(\"dy\", \"1.2em\")\n", | |
| " .attr(\"x\", 0)\n", | |
| " .text(function(d) {return Math.ceil(d.value * 10000) /10000; });\n", | |
| "\n", | |
| "// Returns a flattened hierarchy containing all leaf nodes under the root.\n", | |
| "function classes(root) {\n", | |
| " var classes = [];\n", | |
| "\n", | |
| " function recurse(term, node) {\n", | |
| " if (node.children) node.children.forEach(function(child) { recurse(node.term, child); });\n", | |
| " else classes.push({topicName: node.topicId, className: node.term, value: node.probability});\n", | |
| " }\n", | |
| "\n", | |
| " recurse(null, root);\n", | |
| " return {children: classes};\n", | |
| "}\n", | |
| "\n", | |
| "</script>\"\"\" % stringJson" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 60, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "\n", | |
| "<!DOCTYPE html>\n", | |
| "<meta charset=\"utf-8\">\n", | |
| "<style>\n", | |
| "\n", | |
| "circle {\n", | |
| " fill: rgb(31, 119, 180);\n", | |
| " fill-opacity: 0.5;\n", | |
| " stroke: rgb(31, 119, 180);\n", | |
| " stroke-width: 1px;\n", | |
| "}\n", | |
| "\n", | |
| ".leaf circle {\n", | |
| " fill: #ff7f0e;\n", | |
| " fill-opacity: 1;\n", | |
| "}\n", | |
| "\n", | |
| "text {\n", | |
| " font: 14px sans-serif;\n", | |
| "}\n", | |
| "\n", | |
| "</style>\n", | |
| "<body>\n", | |
| "<script src=\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js\"></script>\n", | |
| "\n", | |
| "<script>\n", | |
| "\n", | |
| "var json = {\n", | |
| " \"name\": \"data\",\n", | |
| " \"children\": [\n", | |
| " {\n", | |
| " \"name\": \"topics\",\n", | |
| " \"children\": [\n", | |
| " {\"term\":\"snow\",\"probability\":0.06482984277484072,\"topicId\":0},{\"term\":\"ski\",\"probability\":0.045950294558274096,\"topicId\":0},{\"term\":\"mountains\",\"probability\":0.039156100706073844,\"topicId\":0},{\"term\":\"winter\",\"probability\":0.031679928472898536,\"topicId\":0},{\"term\":\"skiing\",\"probability\":0.030337389898223453,\"topicId\":0},{\"term\":\"nature\",\"probability\":0.030827512537037274,\"topicId\":1},{\"term\":\"lake\",\"probability\":0.02478979431897514,\"topicId\":1},{\"term\":\"spring\",\"probability\":0.024462219076508293,\"topicId\":1},{\"term\":\"landscape\",\"probability\":0.0224125857946134,\"topicId\":1},{\"term\":\"flowers\",\"probability\":0.02067768513339612,\"topicId\":1},{\"term\":\"luzern\",\"probability\":0.036862489880800375,\"topicId\":2},{\"term\":\"switzerland\",\"probability\":0.0338325739317431,\"topicId\":2},{\"term\":\"verbier\",\"probability\":0.02953757773230965,\"topicId\":2},{\"term\":\"zurich\",\"probability\":0.02188718968235265,\"topicId\":2},{\"term\":\"swiss\",\"probability\":0.02123679980747416,\"topicId\":2},{\"term\":\"art\",\"probability\":0.03698756198224295,\"topicId\":3},{\"term\":\"zurich\",\"probability\":0.03313016298308508,\"topicId\":3},{\"term\":\"switzerland\",\"probability\":0.03157849816347215,\"topicId\":3},{\"term\":\"easter\",\"probability\":0.025158555831459168,\"topicId\":3},{\"term\":\"family\",\"probability\":0.0223036711631496,\"topicId\":3},{\"term\":\"travel\",\"probability\":0.06901255967841895,\"topicId\":4},{\"term\":\"instatravel\",\"probability\":0.037299454043090645,\"topicId\":4},{\"term\":\"europe\",\"probability\":0.034330202644613936,\"topicId\":4},{\"term\":\"travelgram\",\"probability\":0.03368921018190022,\"topicId\":4},{\"term\":\"trip\",\"probability\":0.03304379062370829,\"topicId\":4},{\"term\":\"switzerland\",\"probability\":0.034456317666373186,\"topicId\":5},{\"term\":\"goodtimes\",\"probability\":0.03152461878027823,\"topicId\":5},{\"term\":\"zurich\",\
"probability\":0.02950826562264097,\"topicId\":5},{\"term\":\"weekend\",\"probability\":0.022231218091606136,\"topicId\":5},{\"term\":\"tb\",\"probability\":0.018888255116828026,\"topicId\":5},{\"term\":\"geneva\",\"probability\":0.05857098842284779,\"topicId\":6},{\"term\":\"car\",\"probability\":0.031332107870771786,\"topicId\":6},{\"term\":\"switzerland\",\"probability\":0.0276575550779648,\"topicId\":6},{\"term\":\"lamborghini\",\"probability\":0.02190996776182064,\"topicId\":6},{\"term\":\"ferrari\",\"probability\":0.02052769480440934,\"topicId\":6},{\"term\":\"visitswitzerland\",\"probability\":0.03794345847628957,\"topicId\":7},{\"term\":\"vscocam\",\"probability\":0.034333155243237684,\"topicId\":7},{\"term\":\"switzerlandwonderland\",\"probability\":0.03226477690795611,\"topicId\":7},{\"term\":\"vsco\",\"probability\":0.03213684652910808,\"topicId\":7},{\"term\":\"myswitzerland\",\"probability\":0.028912085943440736,\"topicId\":7},{\"term\":\"fitness\",\"probability\":0.050251802705119475,\"topicId\":8},{\"term\":\"healthy\",\"probability\":0.02958433988432552,\"topicId\":8},{\"term\":\"sport\",\"probability\":0.029353669207932892,\"topicId\":8},{\"term\":\"workout\",\"probability\":0.028909345997833683,\"topicId\":8},{\"term\":\"motivation\",\"probability\":0.028554499225599026,\"topicId\":8},{\"term\":\"suisse\",\"probability\":0.04139109858365735,\"topicId\":9},{\"term\":\"montreux\",\"probability\":0.03704523460681258,\"topicId\":9},{\"term\":\"lacleman\",\"probability\":0.03250671538638928,\"topicId\":9},{\"term\":\"lausanne\",\"probability\":0.02801144795578456,\"topicId\":9},{\"term\":\"switzerland\",\"probability\":0.02688108605034323,\"topicId\":9},{\"term\":\"instagood\",\"probability\":0.03645624632489141,\"topicId\":10},{\"term\":\"picoftheday\",\"probability\":0.03076251964509694,\"topicId\":10},{\"term\":\"photooftheday\",\"probability\":0.029703059421505945,\"topicId\":10},{\"term\":\"instadaily\",\"probability\":0.028632883460075267,\"topicI
d\":10},{\"term\":\"instalike\",\"probability\":0.026769381245962745,\"topicId\":10},{\"term\":\"music\",\"probability\":0.040575034449173424,\"topicId\":11},{\"term\":\"party\",\"probability\":0.03921196577945221,\"topicId\":11},{\"term\":\"friends\",\"probability\":0.027131120772706,\"topicId\":11},{\"term\":\"konstanz\",\"probability\":0.024898402612021024,\"topicId\":11},{\"term\":\"bodensee\",\"probability\":0.02318591615707108,\"topicId\":11},{\"term\":\"baselworld2016\",\"probability\":0.056018990170552174,\"topicId\":12},{\"term\":\"baselworld\",\"probability\":0.05048503702936801,\"topicId\":12},{\"term\":\"basel\",\"probability\":0.04590056891456354,\"topicId\":12},{\"term\":\"luxury\",\"probability\":0.041513618850625066,\"topicId\":12},{\"term\":\"watches\",\"probability\":0.03597274460743156,\"topicId\":12},{\"term\":\"swiss\",\"probability\":0.02934004343949627,\"topicId\":13},{\"term\":\"nofilter\",\"probability\":0.02346935547247407,\"topicId\":13},{\"term\":\"switzerland\",\"probability\":0.021960428794446603,\"topicId\":13},{\"term\":\"selfie\",\"probability\":0.020904185010864756,\"topicId\":13},{\"term\":\"love\",\"probability\":0.019373152419980853,\"topicId\":13},{\"term\":\"food\",\"probability\":0.030915742768915375,\"topicId\":14},{\"term\":\"foodporn\",\"probability\":0.03030777683238159,\"topicId\":14},{\"term\":\"day\",\"probability\":0.02158525753702872,\"topicId\":14},{\"term\":\"instafood\",\"probability\":0.020540403766244577,\"topicId\":14},{\"term\":\"yummy\",\"probability\":0.019046245223605044,\"topicId\":14}\n", | |
| " ]\n", | |
| " }\n", | |
| " ]\n", | |
| "};\n", | |
| "\n", | |
| "var r = 1500,\n", | |
| " format = d3.format(\",d\"),\n", | |
| " fill = d3.scale.category20c();\n", | |
| "\n", | |
| "var bubble = d3.layout.pack()\n", | |
| " .sort(null)\n", | |
| " .size([r, r])\n", | |
| " .padding(1.5);\n", | |
| "\n", | |
| "var vis = d3.select(\"body\").append(\"svg\")\n", | |
| " .attr(\"width\", r)\n", | |
| " .attr(\"height\", r)\n", | |
| " .attr(\"class\", \"bubble\");\n", | |
| "\n", | |
| " \n", | |
| "var node = vis.selectAll(\"g.node\")\n", | |
| " .data(bubble.nodes(classes(json))\n", | |
| " .filter(function(d) { return !d.children; }))\n", | |
| " .enter().append(\"g\")\n", | |
| " .attr(\"class\", \"node\")\n", | |
| " .attr(\"transform\", function(d) { return \"translate(\" + d.x + \",\" + d.y + \")\"; })\n", | |
| " color = d3.scale.category20();\n", | |
| " \n", | |
| " node.append(\"title\")\n", | |
| " .text(function(d) { return d.className + \": \" + format(d.value); });\n", | |
| "\n", | |
| " node.append(\"circle\")\n", | |
| " .attr(\"r\", function(d) { return d.r; })\n", | |
| " .style(\"fill\", function(d) {return color(d.topicName);});\n", | |
| "\n", | |
| "var text = node.append(\"text\")\n", | |
| " .attr(\"text-anchor\", \"middle\")\n", | |
| " .attr(\"dy\", \".3em\")\n", | |
| " .text(function(d) { return d.className.substring(0, d.r / 3)});\n", | |
| " \n", | |
| " text.append(\"tspan\")\n", | |
| " .attr(\"dy\", \"1.2em\")\n", | |
| " .attr(\"x\", 0)\n", | |
| " .text(function(d) {return Math.ceil(d.value * 10000) /10000; });\n", | |
| "\n", | |
| "// Returns a flattened hierarchy containing all leaf nodes under the root.\n", | |
| "function classes(root) {\n", | |
| " var classes = [];\n", | |
| "\n", | |
| " function recurse(term, node) {\n", | |
| " if (node.children) node.children.forEach(function(child) { recurse(node.term, child); });\n", | |
| " else classes.push({topicName: node.topicId, className: node.term, value: node.probability});\n", | |
| " }\n", | |
| "\n", | |
| " recurse(null, root);\n", | |
| " return {children: classes};\n", | |
| "}\n", | |
| "\n", | |
| "</script>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "display(HTML(html_code))" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.12" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment