Created
November 24, 2017 18:39
-
-
Save AdamSpannbauer/020d694182602073de27de01408e0509 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################## | |
# ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8 | |
################################################################## | |
# GET EXAMPLE DATA | |
#---------------------------------------------------------- | |
library(xml2) | |
library(rvest) | |
options(stringsAsFactors = FALSE)
# Two URLs with stories from cnn.com
urls <- c("http://money.cnn.com/2017/11/20/technology/google-pixel-buds-review/index.html",
          "http://money.cnn.com/2017/11/23/technology/battlesgrounds-game-tencent-china/index.html")
# CSS selector used to pick the story text nodes out of each page
selector <- c("#storytext p , .speakable")
# Build one data frame per URL; doc_id is the URL's position in `urls`
my_df_list <- lapply(seq_along(urls), function(i) {
  u <- urls[i]
  # Read the page and keep only the nodes matched by the selector
  raw_html <- xml2::read_html(u)
  story_nodes <- rvest::html_nodes(raw_html, selector)
  # Drop the html tags, leaving one string per matched node
  text_lines <- rvest::html_text(story_nodes)
  # Size the id columns explicitly: recycling a length-1 value against a
  # zero-length `text_lines` makes data.frame() fail, so a page with no
  # matching nodes now yields a zero-row data frame instead of an error
  n_lines <- length(text_lines)
  data.frame(
    doc_id = rep(i, n_lines),
    url = rep(u, n_lines),
    text = text_lines
  )
})
# Combine the per-URL data frames into a single data frame
my_df <- do.call("rbind", my_df_list)
#---------------------------------------------------------- | |
# POSSIBLE TIDYVERSE SOLUTION TO ISSUE USING `purrr::map()` | |
#---------------------------------------------------------- | |
library(dplyr) | |
library(purrr) | |
# Convert to tibble; as_tibble() replaces the deprecated as_data_frame()
my_tbl <- as_tibble(my_df)
# Get the top lexranked sentence(s) in a data frame.
#
# Arguments:
#   df_in     data frame holding the text to rank
#   text_col  name (string) of the column of `df_in` containing the text
#   n         how many top-ranked sentences to keep (default 1)
#
# Returns the sentence-level data frame sorted by descending `lexrank`,
# cut down to its first `n` rows (zero rows when `n` is 0).
get_top_sentences <- function(df_in, text_col = "text", n = 1) {
  # seq_len(n) is used instead of 1:n because 1:0 evaluates to c(1, 0),
  # which would still select the first row when n = 0
  lex_df <- lexRankr::unnest_sentences_(df_in, "sentences", text_col) %>% # parse sentences
    lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>% # perform lexrank
    arrange(desc(lexrank)) %>% # highest lexrank first
    slice(seq_len(n)) # keep the top n row(s)
  return(lex_df)
}
# Top-ranked sentence(s) for every document:
# partition the tibble by doc_id, rank each piece on its own,
# then stack the per-document results back into one data frame
top_sent_df <- my_tbl %>%
  split(.$doc_id) %>%
  map(function(doc_df) get_top_sentences(doc_df, n = 1)) %>%
  bind_rows()
#---------------------------------------------------------- | |
# OUTPUT | |
#---------------------------------------------------------- | |
# Show the selected sentence for each document (sample output below)
top_sent_df[["sentences"]]
# [1] " But when Google (GOOG) announced its new Pixel Buds in October,
# touting the ability to translate a conversation between different
# languages in near real time, it promised something unique."
# [2] " Chinese tech giant Tencent (TCEHY) has announced plans to
# distribute PlayerUnknown's \"Battlegrounds\" in its home market
# after modifying the violent game to comply with \"socialist core
# values.\" "
# Show the matching lexrank scores (sample output below)
top_sent_df[["lexrank"]]
# [1] 0.06505914 0.07053404
#---------------------------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment