Last active
September 28, 2018 06:52
-
-
Save beemyfriend/47479890f82f0c80e3a247aebada51d1 to your computer and use it in GitHub Desktop.
Mining CCS 2018 Presentation Data and Creating a Graph
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(rvest) | |
library(igraph) | |
##########
## Download and parse the CCS 2018 schedule page
##########
html <- "http://ccs2018.web.auth.gr/schedule" %>%
  read_html()
#############
## Pull the track names from the track selection widget and the talk
## rows from the schedule page.
## Thanks to selector gadget for pulling the css identifier
#############
tracks <- html_nodes(html, '#edit-field-session-tid') %>%
  html_children() %>%
  html_text() %>%
  # Drop the first option (presumably the widget's placeholder entry --
  # TODO confirm). Negative indexing stays correct when the widget has
  # fewer than two options, where `2:length(.)` would count downwards and
  # introduce NA entries.
  .[-1] %>%
  # Appended last, so every scraped track name precedes it in the vector.
  c('All Tracks') %>%
  # Escape parentheses: these strings are used as regex patterns when the
  # track of each talk is detected.
  str_replace_all('\\(', '\\\\(') %>%
  str_replace_all('\\)', '\\\\)') %>%
  # Correct the misspelled option text.
  str_replace('Complexity in Pyshics and Chemistry', 'Complexity in Physics and Chemistry')

# One node per element carrying the `views-align-center` class.
talks <- html_nodes(html, ".views-align-center")
############
## Talk titles: text of the child elements of the talk nodes, with every
## '(' and ')' prefixed by a backslash to avoid regex issues when the
## title is later used as a pattern.
## NOTE(review): assumes each talk node has exactly one child so that
## titles line up with `talks` by position -- TODO confirm
############
talk_title <- html_text(html_children(talks))
talk_title <- str_replace_all(talk_title, '\\(', '\\\\(')
talk_title <- str_replace_all(talk_title, '\\)', '\\\\)')
#############
## Rooms are consistently the last thing provided in text:
## take everything from 'Room' to the end of the string, drop trailing
## whitespace, and use '' where no room was found
#############
talk_rooms <- str_extract(html_text(talks), 'Room.+$')
talk_rooms <- str_replace(talk_rooms, '\\s*$', '')
talk_rooms <- ifelse(is.na(talk_rooms), '', talk_rooms)
##################
## Extract text that matches an option from the selection dropdown.
## Each entry of `tracks` is tried as a regex against the talk text; when
## several match, the one that comes first in `tracks` is kept, and ''
## is used when none match.
##################
talk_track <- talks %>%
  html_text() %>%
  map_chr(function(talk_text) {
    matched <- tracks[str_detect(talk_text, tracks)]
    # Plain if/else on a scalar condition: makes the "first match wins"
    # rule explicit instead of relying on ifelse() truncating its result.
    if (length(matched) == 0) '' else matched[[1]]
  })
##################
## Remove all extracted text. Anything left over is a name
###################

# Remove the first literal occurrence of `escaped` from `text`.
#   text    : a single string.
#   escaped : a single string in which '(' and ')' may be prefixed by a
#             backslash (as in `talk_track` and `talk_title`); '' means
#             there is nothing to remove.
# The backslash escapes are undone and the result is matched with fixed(),
# so characters such as '?', '.', '+' or unescaped parentheses (e.g. in a
# room name) are treated as plain text rather than as regex syntax.
strip_extracted <- function(text, escaped) {
  if (escaped == '') {
    return(text)
  }
  literal <- str_replace_all(escaped, '\\\\([()])', '\\1')
  str_replace(text, fixed(literal), '')
}

talks_speakers <- talks %>%
  html_text() %>%
  str_replace('^\\s+', '') %>%
  imap_chr(function(talk_text, i) {
    # Same removal order as before: track, then title, then room.
    talk_text %>%
      strip_extracted(talk_track[i]) %>%
      strip_extracted(talk_title[i]) %>%
      strip_extracted(talk_rooms[i]) %>%
      str_trim()
  })
##############
## Create a data frame with all extracted information:
## one row per (talk, speaker) pair
##############
ccsTalks <- tibble(
  title = talk_title,
  track = talk_track,
  rooms = talk_rooms,
  speakers = talks_speakers
) %>%
  filter(speakers != '') %>%
  mutate(speakers = str_split(speakers, ',| and ')) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated.
  unnest(speakers) %>%
  mutate(speakers = str_trim(speakers)) %>%
  # Splitting text such as "A, B, and C" leaves an empty piece between
  # ',' and ' and '; drop those so they do not become speakers.
  filter(speakers != '')
##################
## Create edge list by joining data frame to itself by track.
## Columns: from, to (speaker names, from < to), n (number of joined rows
## for the pair) and track ('multiple' when the pair shares more than one
## distinct track).
###################
ccsEL <- ccsTalks %>%
  select(from = speakers, track) %>%
  # Explicit key instead of relying on the natural-join default.
  left_join(select(ccsTalks, to = speakers, track), by = "track") %>%
  # Keeps one orientation of each pair and drops self-pairs.
  filter(from < to) %>%
  group_by(from, to) %>%
  summarize(
    n = n(),
    # Decide on distinct tracks, not on row count: two speakers who meet
    # several times within one track still share a single track.
    track = if (n_distinct(track) > 1) 'multiple' else first(track)
  ) %>%
  ungroup()
#################
## Node List of authors/presenters with attribute being the author's track
## ('multiple' when the speaker appears in more than one distinct track)
##################
ccsNL <- ccsTalks %>%
  group_by(speakers) %>%
  summarize(
    # Count distinct tracks rather than rows, so a speaker with several
    # talks in the same track keeps that track.
    track = if (n_distinct(track) > 1) 'multiple' else first(track)
  )
####################
## Create a color scale matching topic:
## a named integer vector mapping each track label to its position.
## Labels from the node list come first (same order as before); any track
## in ccsTalks that no node carries is appended, so that looking up a
## track by name never yields NA.
####################
colorVertices <- c(ccsNL$track, ccsTalks$track) %>%
  unique() %>%
  setNames(seq_along(.), .)
#####################
## Create graph: undirected, speakers as vertices, edges from ccsEL
####################
# Fixed seed so the layout computed below is reproducible.
set.seed(4321)
ccsG <- graph_from_data_frame(ccsEL, directed = FALSE, vertices = ccsNL) %>%
  set_edge_attr('width', value = .1) %>%
  set_vertex_attr('size', value = 5) %>%
  # Empty labels: vertices are drawn without text.
  set_vertex_attr("label", value = '') %>%
  # Colour index is the track's position in colorVertices, shifted by one.
  set_vertex_attr("color", value = colorVertices[V(.)$track] + 1) %>%
  # Store the layout on the graph so plot() reuses it.
  set_graph_attr('layout', value = layout_nicely(.))

plot(ccsG, main = "Speakers Connected by Tracks")
###################
#### Same as previous, but edges are people and nodes are tracks.
#### Columns: from, to (track names, from < to) and n, the number of
#### speakers appearing in both tracks.
###################
ccsTracksEL <- ccsTalks %>%
  # One row per (speaker, track): a speaker with several talks in a track
  # must count once towards an edge, not once per talk.
  distinct(speakers, track) %>%
  {
    left_join(
      select(., from = track, speakers),
      select(., speakers, to = track),
      # Explicit key instead of relying on the natural-join default.
      by = "speakers"
    )
  } %>%
  # Keeps one orientation of each track pair and drops self-pairs.
  filter(from < to) %>%
  # Ungrouped result with the pair count in column `n`.
  count(from, to)
##############
## Node size will be the number of authors in a track
##############
ccsTracksNL <- ccsTalks %>%
  group_by(track) %>%
  summarize(
    # Distinct speakers, so an author with several talks in the same
    # track is counted once.
    n = n_distinct(speakers)
  )
# Fixed seed so the layout computed below is reproducible.
set.seed(4321)
ccsTracksG <- graph_from_data_frame(
  ccsTracksEL,
  directed = FALSE,
  vertices = ccsTracksNL
) %>%
  # Edge width and vertex size are scaled relative to the largest `n`.
  set_edge_attr('width', value = 5 * E(.)$n / max(E(.)$n)) %>%
  set_vertex_attr('size', value = V(.)$n/max(V(.)$n)*30) %>%
  set_vertex_attr("label", value = V(.)$name) %>%
  # Colour index is the track's position in colorVertices, shifted by one.
  set_vertex_attr("color", value = colorVertices[V(.)$name] + 1) %>%
  set_vertex_attr("label.cex", value = .8) %>%
  # Store the layout on the graph so plot() reuses it.
  set_graph_attr('layout', value = layout_nicely(.))

plot(ccsTracksG, main = "Tracks Connected by Speakers")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment