Skip to content

Instantly share code, notes, and snippets.

@AdamSpannbauer
Created November 25, 2017 20:33
Show Gist options
  • Save AdamSpannbauer/85f4196f1deba95fce1087deae7104b8 to your computer and use it in GitHub Desktop.
Save AdamSpannbauer/85f4196f1deba95fce1087deae7104b8 to your computer and use it in GitHub Desktop.
##################################################################
# ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8
##################################################################
# Monduiz'S EXAMPLE CODE
##################################################################
library(rvest)
library(tidyverse)
library(stringr)
library(purrr)
library(lexRankr)
# Download and parse the Globe and Mail politics landing page; both the
# article links and the headline text are extracted from this one document.
gm_headlines <- read_html("https://beta.theglobeandmail.com/politics/")

# Collect the href of every card link and resolve it against the site root.
gm_links <- gm_headlines %>%
  html_nodes(".o-card__link") %>%
  html_attr("href") %>%
  xml2::url_absolute("https://beta.theglobeandmail.com")

# Fetch and parse every linked page (one request per link).
pages <- gm_links %>% map(read_html)

# For each page, extract the text of the article-body nodes. The result is a
# list with one character vector per page.
gm_articles <- pages %>%
  map(
    . %>%
      html_nodes(".c-article-body__text") %>%
      html_text()
  )

# Headline text taken from the landing-page cards.
gm_titles <- gm_headlines %>%
  html_nodes(".o-card__content-text") %>%
  html_text()

# tibble() replaces the deprecated data_frame(). gm_articles is a list, so it
# is stored as a list-column.
# NOTE(review): assumes the title and link selectors match the same number of
# nodes; otherwise the columns have different lengths -- confirm on the page.
gm <- tibble(gm_titles, gm_links, gm_articles)
# Remove duplicates and video links, then number the remaining documents.
gm <- gm %>%
  # Keep the first row for each distinct headline.
  distinct(gm_titles, .keep_all = TRUE) %>%
  # Drop rows whose link contains "video".
  filter(!str_detect(gm_links, "video")) %>%
  # row_number() instead of 1:length(gm_articles): the latter evaluates to
  # c(1, 0) when no rows remain, which makes mutate() fail.
  mutate(doc_id = row_number())
### summarization
# Keep the document id and the article list-column, then expand the
# list-column so that each element of gm_articles gets its own row.
# `cols` is passed by name because unnest() warns when it is omitted
# (tidyr >= 1.0).
gm_unnest <- gm %>%
  select(doc_id, gm_articles) %>%
  unnest(cols = gm_articles)
#------------------------------------------------------------
# MODIFICATION TO GET TOP LEXRANK PER DOC
##################################################################
# Get the top lexranked sentence(s) in a data frame.
#
# Arguments:
#   df_in    data frame holding the text to summarise.
#   text_col name (as a string) of the column in df_in that holds the text;
#            defaults to "text".
#   n        number of top-ranked sentences to keep; defaults to 1.
#
# Returns the sentence-level data frame produced by lexRankr, sorted by
# descending lexrank and limited to the first n rows.
get_top_sentences <- function(df_in, text_col = "text", n = 1) {
  df_in %>%
    # Parse the text column into a "sentences" column.
    lexRankr::unnest_sentences_("sentences", text_col) %>%
    # Perform lexrank at sentence level.
    lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>%
    # Highest-ranked sentences first.
    arrange(desc(lexrank)) %>%
    # seq_len(n) instead of 1:n so that n = 0 yields zero rows (1:0 is
    # c(1, 0), which would still select the first row).
    slice(seq_len(n))
}
# Error-tolerant variant of get_top_sentences: when the wrapped call fails,
# NULL is returned instead of stopping the script. quiet = FALSE keeps the
# error from being hidden.
safe_top_sent <- purrr::possibly(
  get_top_sentences,
  otherwise = NULL,
  quiet = FALSE
)
# Top-ranked sentence(s) for every document.
gm_rank_doc_level <- gm_unnest %>%
  # One data frame per doc_id.
  split(.$doc_id) %>%
  # Extract the two highest-ranked sentences from each document.
  map(safe_top_sent, text_col = "gm_articles", n = 2) %>%
  # Stack the per-document results back into a single data frame.
  bind_rows()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment