Created
December 5, 2016 14:25
-
-
Save AashishTiwari/f00bda47eeb34ca5bc9b10b3685ac1ab to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"### Visualizing Clusters of UP Vidhan Sabha Headlines Using R and Plotly\n", | |
"-- by Aashish K Tiwari" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"Attaching package: ‘dplyr’\n", | |
"\n", | |
"The following objects are masked from ‘package:stats’:\n", | |
"\n", | |
" filter, lag\n", | |
"\n", | |
"The following objects are masked from ‘package:base’:\n", | |
"\n", | |
" intersect, setdiff, setequal, union\n", | |
"\n", | |
"Registering fonts with R\n", | |
"\n", | |
"Attaching package: ‘scales’\n", | |
"\n", | |
"The following objects are masked from ‘package:readr’:\n", | |
"\n", | |
" col_factor, col_numeric\n", | |
"\n", | |
"\n", | |
"Attaching package: ‘plotly’\n", | |
"\n", | |
"The following object is masked _by_ ‘.GlobalEnv’:\n", | |
"\n", | |
" subplot\n", | |
"\n", | |
"The following object is masked from ‘package:ggplot2’:\n", | |
"\n", | |
" last_plot\n", | |
"\n", | |
"The following object is masked from ‘package:stats’:\n", | |
"\n", | |
" filter\n", | |
"\n", | |
"The following object is masked from ‘package:graphics’:\n", | |
"\n", | |
" layout\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"R version 3.3.1 (2016-06-21)\n", | |
"Platform: x86_64-pc-linux-gnu (64-bit)\n", | |
"Running under: Ubuntu 14.04.5 LTS\n", | |
"\n", | |
"locale:\n", | |
" [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C \n", | |
" [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 \n", | |
" [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 \n", | |
" [7] LC_PAPER=en_US.UTF-8 LC_NAME=C \n", | |
" [9] LC_ADDRESS=C LC_TELEPHONE=C \n", | |
"[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C \n", | |
"\n", | |
"attached base packages:\n", | |
"[1] grid stats graphics grDevices utils datasets methods \n", | |
"[8] base \n", | |
"\n", | |
"other attached packages:\n", | |
" [1] Cairo_1.5-9 plotly_4.5.6 tsne_0.1-3 tidyr_0.6.0 \n", | |
" [5] htmlwidgets_0.8 stringr_1.0.0 digest_0.6.9 RColorBrewer_1.1-2\n", | |
" [9] scales_0.4.1 extrafont_0.17 ggplot2_2.2.0 dplyr_0.5.0 \n", | |
"[13] readr_1.0.0 \n", | |
"\n", | |
"loaded via a namespace (and not attached):\n", | |
" [1] Rcpp_0.12.7 plyr_1.8.4 base64enc_0.1-3 tools_3.3.1 \n", | |
" [5] uuid_0.1-2 viridisLite_0.1.3 jsonlite_0.9.22 evaluate_0.9 \n", | |
" [9] memoise_1.0.0 tibble_1.2 gtable_0.2.0 IRdisplay_0.4.3 \n", | |
"[13] DBI_0.5-1 IRkernel_0.7 Rttf2pt1_1.3.4 repr_0.7 \n", | |
"[17] httr_1.2.1 R6_2.1.2 pbdZMQ_0.2-3 purrr_0.2.2 \n", | |
"[21] extrafontdb_1.0 magrittr_1.5 htmltools_0.3.5 assertthat_0.1 \n", | |
"[25] colorspace_1.2-7 stringi_1.1.1 lazyeval_0.2.0 munsell_0.4.3 \n", | |
"[29] crayon_1.3.1 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"options(warn=1)\n", | |
"\n", | |
"source(\"Rstart.R\")\n", | |
"\n", | |
"\n", | |
"library(htmlwidgets)\n", | |
"library(tidyr)\n", | |
"library(tsne)\n", | |
"library(plotly)\n", | |
"library(Cairo)\n", | |
"\n", | |
"sessionInfo()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Read the CSV file up_headlines_53D.csv and preview its first rows." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Parsed with column specification:\n", | |
"cols(\n", | |
" headline_text = col_character(),\n", | |
" words = col_character(),\n", | |
" vectors = col_character()\n", | |
")\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# A tibble: 6 × 3\n", | |
" headline_text\n", | |
" <chr>\n", | |
"1 पूर्व न्यायाधीश काटजू द्वारा महात्मा गांधी और सुभाष चन्द्र बोस के बारे में की गई\n", | |
"2 प्रदेश में जिला पंचायत अध्यक्षों तथा क्षेत्र पंचायत अध्यक्षों का चुनाव जनता द्वा\n", | |
"3 श्रमिकों के स्वास्थ्य जीवन सुरक्षा एवं सामाजिक सुरक्षा के लिए सरकार की विशेष कार\n", | |
"4 पूर्व न्यायाधीश काटजू द्वारा महात्मा गांधी और सुभाष चन्द्र बोस के बारे में की गय\n", | |
"5 जेलों में बन्दियों से काम कराये जाने को मजदूरी सामान्य मजदूरी से कम होने से न्यू\n", | |
"6 प्रदेश में पंचायत चुनाव को ध्यान में रखते हुए नये ग्राम पंचायतों के गठन की सरकार\n", | |
"# ... with 2 more variables: words <chr>, vectors <chr>\n" | |
] | |
} | |
], | |
"source": [ | |
"# Load the headline table (columns: headline_text, words, vectors) and preview it.\n", | |
"df <- read_csv(\"up_headlines_53D.csv\")\n", | |
"\n", | |
"print(head(df))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Trim Vectors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[1] \"w2v_1\" \"w2v_2\" \"w2v_3\" \"w2v_4\" \"w2v_5\" \"w2v_6\"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"'0.0328820116793, 0.0153017712659, -0.0768153635834, -0.0781457485175, 0.0856339146236, 0.0806637675452, 0.016138706969, -0.0416410333434, -0.0260888401627, 0.0773809610707, 0.000783105723952, -0.0178328938997, -0.0555987699435, -0.106906409522, -0.0456301614544, 0.00785618393045, 0.0798873163968, -0.0414245682523, 0.0495745174856, 0.0870355924861, -0.0164662080474, -0.00790620378929, 0.0603307797819, 0.0203287915915, -0.112791778765, 0.0923950764056, 0.0157565759299, 0.0564356347926, -0.0518590922614, 0.0368148758762, 0.0106712522867, -0.0284038266706, -0.0851529049442, 0.0387461587395, 0.0270565854827, -0.0408039603774, 0.0257963363669, 0.00687891808584, 0.0478950645775, -0.0304270914731, 0.0369687206357, -0.03236692438, 0.0880861730364, -0.0360604567747, 0.00944715514044, -0.00866862892565, 0.0712618977812, 0.0362615068688, -0.0087818749445, 0.00939701870084'" | |
], | |
"text/latex": [ | |
"'0.0328820116793, 0.0153017712659, -0.0768153635834, -0.0781457485175, 0.0856339146236, 0.0806637675452, 0.016138706969, -0.0416410333434, -0.0260888401627, 0.0773809610707, 0.000783105723952, -0.0178328938997, -0.0555987699435, -0.106906409522, -0.0456301614544, 0.00785618393045, 0.0798873163968, -0.0414245682523, 0.0495745174856, 0.0870355924861, -0.0164662080474, -0.00790620378929, 0.0603307797819, 0.0203287915915, -0.112791778765, 0.0923950764056, 0.0157565759299, 0.0564356347926, -0.0518590922614, 0.0368148758762, 0.0106712522867, -0.0284038266706, -0.0851529049442, 0.0387461587395, 0.0270565854827, -0.0408039603774, 0.0257963363669, 0.00687891808584, 0.0478950645775, -0.0304270914731, 0.0369687206357, -0.03236692438, 0.0880861730364, -0.0360604567747, 0.00944715514044, -0.00866862892565, 0.0712618977812, 0.0362615068688, -0.0087818749445, 0.00939701870084'" | |
], | |
"text/markdown": [ | |
"'0.0328820116793, 0.0153017712659, -0.0768153635834, -0.0781457485175, 0.0856339146236, 0.0806637675452, 0.016138706969, -0.0416410333434, -0.0260888401627, 0.0773809610707, 0.000783105723952, -0.0178328938997, -0.0555987699435, -0.106906409522, -0.0456301614544, 0.00785618393045, 0.0798873163968, -0.0414245682523, 0.0495745174856, 0.0870355924861, -0.0164662080474, -0.00790620378929, 0.0603307797819, 0.0203287915915, -0.112791778765, 0.0923950764056, 0.0157565759299, 0.0564356347926, -0.0518590922614, 0.0368148758762, 0.0106712522867, -0.0284038266706, -0.0851529049442, 0.0387461587395, 0.0270565854827, -0.0408039603774, 0.0257963363669, 0.00687891808584, 0.0478950645775, -0.0304270914731, 0.0369687206357, -0.03236692438, 0.0880861730364, -0.0360604567747, 0.00944715514044, -0.00866862892565, 0.0712618977812, 0.0362615068688, -0.0087818749445, 0.00939701870084'" | |
], | |
"text/plain": [ | |
"[1] \"0.0328820116793, 0.0153017712659, -0.0768153635834, -0.0781457485175, 0.0856339146236, 0.0806637675452, 0.016138706969, -0.0416410333434, -0.0260888401627, 0.0773809610707, 0.000783105723952, -0.0178328938997, -0.0555987699435, -0.106906409522, -0.0456301614544, 0.00785618393045, 0.0798873163968, -0.0414245682523, 0.0495745174856, 0.0870355924861, -0.0164662080474, -0.00790620378929, 0.0603307797819, 0.0203287915915, -0.112791778765, 0.0923950764056, 0.0157565759299, 0.0564356347926, -0.0518590922614, 0.0368148758762, 0.0106712522867, -0.0284038266706, -0.0851529049442, 0.0387461587395, 0.0270565854827, -0.0408039603774, 0.0257963363669, 0.00687891808584, 0.0478950645775, -0.0304270914731, 0.0369687206357, -0.03236692438, 0.0880861730364, -0.0360604567747, 0.00944715514044, -0.00866862892565, 0.0712618977812, 0.0362615068688, -0.0087818749445, 0.00939701870084\"" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Column names for the 50 embedding dimensions: w2v_1 ... w2v_50.\n", | |
"vector_names <- paste0(\"w2v_\", seq_len(50))\n", | |
"\n", | |
"# Drop the first and last character of each string (presumably the brackets\n", | |
"# wrapping the serialized vector -- confirm against the raw CSV).\n", | |
"# Works element-wise on character vectors.\n", | |
"vector_trim <- function(vector) {\n", | |
"    last_kept <- nchar(vector) - 1\n", | |
"    substr(vector, 2, last_kept)\n", | |
"}\n", | |
"\n", | |
"vector_names %>% head() %>% print()\n", | |
"vector_trim(df$vectors[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Apply the trim operation to the vectors column, then split it into one numeric column per dimension" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# A tibble: 6 × 4\n", | |
" w2v_1 w2v_2 w2v_3 w2v_4\n", | |
" <dbl> <dbl> <dbl> <dbl>\n", | |
"1 0.032882012 0.015301771 -0.07681536 -0.07814575\n", | |
"2 -0.009557094 0.025808075 -0.10414318 -0.10744891\n", | |
"3 0.013315650 0.003658768 -0.12385208 -0.16951320\n", | |
"4 0.021004910 0.002648041 -0.06290056 -0.06146964\n", | |
"5 0.075372205 -0.033468563 -0.21310763 -0.05953282\n", | |
"6 0.062007089 -0.023262943 -0.07187693 -0.15581162\n" | |
] | |
} | |
], | |
"source": [ | |
"# substr()/nchar() are vectorized, so trim the whole column in one call.\n", | |
"# lapply() would turn `vectors` into a list-column, whereas separate()\n", | |
"# expects a character column.\n", | |
"df$vectors <- vector_trim(df$vectors)\n", | |
"\n", | |
"# Split the comma-separated string into the 50 w2v_* columns;\n", | |
"# convert = TRUE parses each piece to a numeric value.\n", | |
"df <- separate(data = df, col = vectors, into = vector_names, sep = \",\", convert = TRUE)\n", | |
"\n", | |
"df %>% select(w2v_1:w2v_4) %>% head() %>% print()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Apply tsne to visualize high-dimensional data by giving each datapoint a location in a two or three-dimensional map." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"sigma summary: Min. : 0.3649 |1st Qu. : 0.5092 |Median : 0.5744 |Mean : 0.6471 |3rd Qu. : 0.7147 |Max. : 1.37 |\n", | |
"Epoch: Iteration #50 error is: 22.7436992247251\n", | |
"Epoch: Iteration #100 error is: 22.6745722156027\n", | |
"Epoch: Iteration #150 error is: 2.77797369977655\n", | |
"Epoch: Iteration #200 error is: 2.47160629606748\n", | |
"Epoch: Iteration #250 error is: 2.32516467252177\n", | |
"Epoch: Iteration #300 error is: 2.14250491538372\n", | |
"Epoch: Iteration #350 error is: 2.04176627940595\n", | |
"Epoch: Iteration #400 error is: 1.97915906112453\n", | |
"Epoch: Iteration #450 error is: 1.93541624348131\n", | |
"Epoch: Iteration #500 error is: 1.90270492165228\n", | |
"Epoch: Iteration #550 error is: 1.87714411796289\n", | |
"Epoch: Iteration #600 error is: 1.85647890054709\n", | |
"Epoch: Iteration #650 error is: 1.83933720260265\n", | |
"Epoch: Iteration #700 error is: 1.82489199991234\n", | |
"Epoch: Iteration #750 error is: 1.81243942643664\n", | |
"Epoch: Iteration #800 error is: 1.80161654143028\n", | |
"Epoch: Iteration #850 error is: 1.7920762079765\n", | |
"Epoch: Iteration #900 error is: 1.78359013680289\n", | |
"Epoch: Iteration #950 error is: 1.77602530904231\n", | |
"Epoch: Iteration #1000 error is: 1.76929908375223\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
" user system elapsed \n", | |
"17104.159 345.141 17479.279 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# t-SNE is stochastic: fix the RNG seed so this multi-hour embedding is reproducible.\n", | |
"set.seed(42)\n", | |
"\n", | |
"# Named w2v_matrix rather than `matrix` so the variable does not mask base::matrix.\n", | |
"w2v_matrix <- df %>% select(w2v_1:w2v_50) %>% as.matrix()\n", | |
"\n", | |
"# system.time() reports how long the embedding took; the result is kept in cluster_coords.\n", | |
"system.time( cluster_coords <- tsne(w2v_matrix, initial_dims=50, perplexity=50, epoch=50) )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"About 4.9 hours (17,479 seconds elapsed)!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" [,1] [,2]\n", | |
"[1,] 12.40071 -25.76720\n", | |
"[2,] -79.72086 -23.69962\n", | |
"[3,] -31.07426 -46.22413\n", | |
"[4,] 12.40379 -25.72588\n", | |
"[5,] -42.21251 1.05750\n", | |
"[6,] -64.40599 8.00892\n" | |
] | |
} | |
], | |
"source": [ | |
"# Show the first rows of the 2D coordinate matrix.\n", | |
"print(head(cluster_coords))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# A tibble: 6 × 3\n", | |
" headline_text\n", | |
" <chr>\n", | |
"1 पूर्व न्यायाधीश काटजू द्वारा महात्मा गांधी और सुभाष चन्द्र बोस के बारे में की गई\n", | |
"2 प्रदेश में जिला पंचायत अध्यक्षों तथा क्षेत्र पंचायत अध्यक्षों का चुनाव जनता द्वा\n", | |
"3 श्रमिकों के स्वास्थ्य जीवन सुरक्षा एवं सामाजिक सुरक्षा के लिए सरकार की विशेष कार\n", | |
"4 पूर्व न्यायाधीश काटजू द्वारा महात्मा गांधी और सुभाष चन्द्र बोस के बारे में की गय\n", | |
"5 जेलों में बन्दियों से काम कराये जाने को मजदूरी सामान्य मजदूरी से कम होने से न्यू\n", | |
"6 प्रदेश में पंचायत चुनाव को ध्यान में रखते हुए नये ग्राम पंचायतों के गठन की सरकार\n", | |
"# ... with 2 more variables: x <dbl>, y <dbl>\n" | |
] | |
} | |
], | |
"source": [ | |
"# Keep the text columns and attach the two t-SNE coordinates as x and y.\n", | |
"df_transform <- df %>%\n", | |
"    select(headline_text, words) %>%\n", | |
"    mutate(x = cluster_coords[, 1],\n", | |
"           y = cluster_coords[, 2])\n", | |
"\n", | |
"print(head(select(df_transform, headline_text, x, y)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step 3: CSV with 2D co-ordinates\n", | |
"[[ go to the top ]](#Table-of-contents)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"#### Write these co-ordinates into a csv file for plotting" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Persist the headline text and 2D coordinates, without a row-name column.\n", | |
"write.csv(df_transform, file = \"df_transform_vidhan_sabha_50D.csv\", row.names = FALSE)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Parsed with column specification:\n", | |
"cols(\n", | |
" headline_text = col_character(),\n", | |
" words = col_character(),\n", | |
" x = col_double(),\n", | |
" y = col_double()\n", | |
")\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# A tibble: 7,914 × 52\n", | |
" headline_text\n", | |
"* <chr>\n", | |
"1 पूर्व न्यायाधीश काटजू द्वारा महात्मा गांधी और सुभाष चन्द्र बोस के बारे में की गई\n", | |
"2 प्रदेश में जिला पंचायत अध्यक्षों तथा क्षेत्र पंचायत अध्यक्षों का चुनाव जनता द्वा\n", | |
"3 श्रमिकों के स्वास्थ्य जीवन सुरक्षा एवं सामाजिक सुरक्षा के लिए सरकार की विशेष कार\n", | |
"4 पूर्व न्यायाधीश काटजू द्वारा महात्मा गांधी और सुभाष चन्द्र बोस के बारे में की गय\n", | |
"5 जेलों में बन्दियों से काम कराये जाने को मजदूरी सामान्य मजदूरी से कम होने से न्यू\n", | |
"6 प्रदेश में पंचायत चुनाव को ध्यान में रखते हुए नये ग्राम पंचायतों के गठन की सरकार\n", | |
"7 प्रदेश में डीजल, पेट्रोल कीमत में कमी होने के कारण बसो व ट्रकों के वे किराये की \n", | |
"8 मेरठ सहित प्रदेश में बांट-माप विभाग द्वारा छापेमारी, नियमित जांच न होने से पेट\n", | |
"9 मेडिकल कालेज में चिकित्सा छात्रों के ज्ञान हेतु एन.आई.टी. विभाग में कैजेवर (शव)\n", | |
"10 प्रदेश में एम्स (आल इण्डिया इन्स्टीट्यूट आफ मेडिकल साइंस) खोलने के लिए केन्द्र स\n", | |
"# ... with 7,904 more rows, and 51 more variables: words <chr>, w2v_1 <dbl>,\n", | |
"# w2v_2 <dbl>, w2v_3 <dbl>, w2v_4 <dbl>, w2v_5 <dbl>, w2v_6 <dbl>,\n", | |
"# w2v_7 <dbl>, w2v_8 <dbl>, w2v_9 <dbl>, w2v_10 <dbl>, w2v_11 <dbl>,\n", | |
"# w2v_12 <dbl>, w2v_13 <dbl>, w2v_14 <dbl>, w2v_15 <dbl>, w2v_16 <dbl>,\n", | |
"# w2v_17 <dbl>, w2v_18 <dbl>, w2v_19 <dbl>, w2v_20 <dbl>, w2v_21 <dbl>,\n", | |
"# w2v_22 <dbl>, w2v_23 <dbl>, w2v_24 <dbl>, w2v_25 <dbl>, w2v_26 <dbl>,\n", | |
"# w2v_27 <dbl>, w2v_28 <dbl>, w2v_29 <dbl>, w2v_30 <dbl>, w2v_31 <dbl>,\n", | |
"# w2v_32 <dbl>, w2v_33 <dbl>, w2v_34 <dbl>, w2v_35 <dbl>, w2v_36 <dbl>,\n", | |
"# w2v_37 <dbl>, w2v_38 <dbl>, w2v_39 <dbl>, w2v_40 <dbl>, w2v_41 <dbl>,\n", | |
"# w2v_42 <dbl>, w2v_43 <dbl>, w2v_44 <dbl>, w2v_45 <dbl>, w2v_46 <dbl>,\n", | |
"# w2v_47 <dbl>, w2v_48 <dbl>, w2v_49 <dbl>, w2v_50 <dbl>\n" | |
] | |
} | |
], | |
"source": [ | |
"# Read the saved coordinates back and print them to confirm the round trip.\n", | |
"# (Previously this printed `df`, the 52-column frame, not the data just read.)\n", | |
"df_53D <- read_csv(\"df_transform_vidhan_sabha_50D.csv\")\n", | |
"print(df_53D)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step 4: Plot the samples\n", | |
"[[ go to the top ]](#Table-of-contents)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Plot using ggplot2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Parsed with column specification:\n", | |
"cols(\n", | |
" headline_text = col_character(),\n", | |
" words = col_character(),\n", | |
" x = col_double(),\n", | |
" y = col_double()\n", | |
")\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"# A tibble: 6 × 3\n", | |
" headline_text x y\n", | |
" <chr> <dbl> <dbl>\n", | |
"1 पूर्व न्यायाधीश काटज 12.40071 -25.76720\n", | |
"2 प्रदेश में जिला पंचा -79.72086 -23.69962\n", | |
"3 श्रमिकों के स्वास्थ् -31.07426 -46.22413\n", | |
"4 पूर्व न्यायाधीश काटज 12.40379 -25.72588\n", | |
"5 जेलों में बन्दियों स -42.21251 1.05750\n", | |
"6 प्रदेश में पंचायत चु -64.40599 8.00892\n" | |
] | |
} | |
], | |
"source": [ | |
"# Load the saved 2D coordinates for plotting.\n", | |
"df_plot <- read_csv(\"df_transform_vidhan_sabha_50D.csv\")\n", | |
"\n", | |
"# Preview the first rows with headlines truncated to 20 characters.\n", | |
"df_plot %>%\n", | |
"    select(headline_text, x, y) %>%\n", | |
"    mutate(headline_text = substr(headline_text, 1, 20)) %>%\n", | |
"    head() %>%\n", | |
"    print()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Static scatter of the 2D coordinates, written to a PNG file.\n", | |
"plot <- ggplot(data = df_plot, mapping = aes(x = x, y = y)) +\n", | |
"    geom_point(alpha = 1, stroke = 0) +\n", | |
"    theme_bw()\n", | |
"\n", | |
"ggsave(filename = \"vidhan-sabha-headlines-cluster-test-53D.png\", plot = plot, width = 4, height = 3, dpi = 300)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"1. The left side of the 2D representation represents the more serious headlines, while the right side represents the more silly headlines.\n", | |
"2. There is a little overlap between the NYTimes/CNN/BuzzFeed articles." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Prototype using plotly's scattergl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Prototype scattergl chart with the headline as hover text.\n", | |
"p <- plot_ly(df_plot,\n", | |
"    x = df_plot$x,\n", | |
"    y = df_plot$y,\n", | |
"    type = \"scattergl\",\n", | |
"    mode = \"markers\",\n", | |
"    marker = list(line = list(width = 0), opacity = 0.75, size = 6),\n", | |
"    text = df_plot$headline_text)\n", | |
"\n", | |
"# Wrap the built plot in an htmlwidget and save it as a self-contained HTML file.\n", | |
"widget_sizing <- sizingPolicy(browser.padding = 0, browser.fill = FALSE,\n", | |
"    defaultWidth = \"100%\", defaultHeight = 400)\n", | |
"widget <- createWidget(name = \"plotly\", x = plotly_build(p), sizingPolicy = widget_sizing)\n", | |
"saveWidget(widget, \"up-vidhansabha-headlines-cluster-test-53D.html\", selfcontained = TRUE, libdir = \"plotly\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"#### Generate custom text for tooltips (note: this was not used since it made charts harder to read)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Build the HTML tooltip for one row of df_plot: the headline, a line break,\n", | |
"# then its words. `row` is the named character vector that apply(df, 1, ...)\n", | |
"# passes in. Columns are looked up by name because df_plot only has\n", | |
"# headline_text, words, x and y; the earlier positional lookups (row[3],\n", | |
"# row[4], row[5]) hit x, y and a missing column, so as.Date() had no date to parse.\n", | |
"processText <- function(row) {\n", | |
"    sprintf(\"%s<br>%s\",\n", | |
"        row[[\"headline_text\"]],\n", | |
"        row[[\"words\"]])\n", | |
"}\n", | |
"\n", | |
"apply(df_plot[1,], 1, processText)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Compute one tooltip string per row and store it in a new `text` column.\n", | |
"df_plot$text <- apply(df_plot, MARGIN = 1, FUN = processText)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Plot the real plotly chart, with layout options to remove the axes." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# https://plot.ly/r/axes/\n", | |
"\n", | |
"# Axis settings that hide the title, zero line, axis line, tick labels and grid.\n", | |
"ax <- list(\n", | |
"    title = \"\",\n", | |
"    zeroline = FALSE,\n", | |
"    showline = FALSE,\n", | |
"    showticklabels = FALSE,\n", | |
"    showgrid = FALSE\n", | |
")\n", | |
"\n", | |
"# Plot margins: zero on the left, right and bottom, 25 on top, no padding.\n", | |
"m <- list(l = 0, r = 0, b = 0, t = 25, pad = 0)\n", | |
"\n", | |
"p <- plot_ly(df_plot,\n", | |
"    x = df_plot$x,\n", | |
"    y = df_plot$y,\n", | |
"    type = \"scattergl\",\n", | |
"    mode = \"markers\",\n", | |
"    marker = list(line = list(width = 0), opacity = 0.75, size = 6),\n", | |
"    text = df_plot$headline_text,\n", | |
"    hoverinfo = \"text+name\")\n", | |
"p <- layout(p, xaxis = ax, yaxis = ax, margin = m)\n", | |
"\n", | |
"# Wrap the built plot in an htmlwidget and save it as a self-contained HTML file.\n", | |
"widget_sizing <- sizingPolicy(browser.padding = 0, browser.fill = FALSE,\n", | |
"    defaultWidth = \"100%\", defaultHeight = 400)\n", | |
"widget <- createWidget(name = \"plotly\", x = plotly_build(p), sizingPolicy = widget_sizing)\n", | |
"saveWidget(widget, \"up-vidhan-sabha-cluster-standalone.html\", selfcontained = TRUE, libdir = \"plotly\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Tweak plot slightly" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Font stack applied to the whole chart.\n", | |
"font <- list(family = 'Source Sans Pro, Arial, sans-serif')\n", | |
"\n", | |
"p <- plot_ly(df_plot,\n", | |
"    x = df_plot$x,\n", | |
"    y = df_plot$y,\n", | |
"    type = \"scattergl\",\n", | |
"    mode = \"markers\",\n", | |
"    marker = list(line = list(width = 0), opacity = 0.75, size = 6),\n", | |
"    text = df_plot$headline_text,\n", | |
"    hoverinfo = \"text+name\")\n", | |
"\n", | |
"# Same hidden axes and margins as before, plus the font and a light background.\n", | |
"p <- layout(p,\n", | |
"    xaxis = ax,\n", | |
"    yaxis = ax,\n", | |
"    margin = m,\n", | |
"    font = font,\n", | |
"    plot_bgcolor = '#f7f8fa',\n", | |
"    paper_bgcolor = '#f7f8fa')\n", | |
"\n", | |
"# Wrap the built plot in an htmlwidget and save it as a self-contained HTML file.\n", | |
"widget_sizing <- sizingPolicy(browser.padding = 0, browser.fill = FALSE,\n", | |
"    defaultWidth = \"100%\", defaultHeight = 400)\n", | |
"widget <- createWidget(name = \"plotly\", x = plotly_build(p), sizingPolicy = widget_sizing)\n", | |
"saveWidget(widget, \"up-vidhan-sabha-headlines-cluster-web.html\", selfcontained = TRUE, libdir = \"plotly\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "R", | |
"language": "R", | |
"name": "ir" | |
}, | |
"language_info": { | |
"codemirror_mode": "r", | |
"file_extension": ".r", | |
"mimetype": "text/x-r-source", | |
"name": "R", | |
"pygments_lexer": "r", | |
"version": "3.3.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment