AdamSpannbauer · November 25, 2017 20:33
diff --git a/mod_Monduiz_code.R b/mod_Monduiz_code.R
 ##################################################################
 # ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8
 ##################################################################

 # Monduiz'S EXAMPLE CODE
 ##################################################################
 library(rvest)
 library(tidyverse)
 library(stringr)
 library(purrr)
 library(lexRankr)

 # Download the politics landing page; the link and title selectors below
 # are both run against this single document.
 gm_headlines <- read_html("https://beta.theglobeandmail.com/politics/")

 # Article URLs taken from the card links, resolved against the site root
 gm_links <- gm_headlines %>%
   html_nodes(".o-card__link") %>%
   html_attr("href") %>%
   xml2::url_absolute("https://beta.theglobeandmail.com")

 # Read every linked page
 pages <- gm_links %>% map(read_html)

 # For each page, the text of every ".c-article-body__text" node
 # (a list with one character vector per page)
 gm_articles <- pages %>%
   map(. %>%
     html_nodes(".c-article-body__text") %>%
     html_text()
   )

 # Headline text taken from the same landing page
 gm_titles <- gm_headlines %>%
   html_nodes(".o-card__content-text") %>%
   html_text()

 # tibble() replaces the deprecated data_frame().
 # NOTE(review): this assumes both selectors return the same number of
 # nodes, in the same order -- confirm against the live page.
 gm <- tibble(gm_titles, gm_links, gm_articles)

 # Remove duplicates and video links
 gm <- gm %>%
   distinct(gm_titles, .keep_all = TRUE) %>%
   filter(!str_detect(gm_links, "video")) %>%
   # row_number() stays valid when no rows survive the filter, whereas
   # 1:length(x) would expand to c(1, 0) and make mutate() fail
   mutate(doc_id = row_number())


 ### summarization
 # One row per article text node, tagged with its document id
 gm_unnest <- gm %>%
   select(doc_id, gm_articles) %>%
   unnest(gm_articles)
 #------------------------------------------------------------


 # MODIFICATION TO GET TOP LEXRANK PER DOC
 ##################################################################

 #' Get the top LexRank-scored sentence(s) from a data frame of text
 #'
 #' Parses the text column into sentences, scores the sentences with
 #' lexRankr, sorts them by descending `lexrank` and keeps the first `n`.
 #'
 #' @param df_in Data frame holding the text to rank.
 #' @param text_col Name (as a string) of the column in `df_in` that
 #'   contains the text.
 #' @param n Number of top-ranked sentences to keep; `0` gives zero rows.
 #' @return The sentence-level data frame, ordered by descending
 #'   `lexrank`, with at most `n` rows.
 get_top_sentences <- function(df_in, text_col = "text", n = 1) {
   df_in %>%
     # parse sentences into a "sentences" column
     lexRankr::unnest_sentences_("sentences", text_col) %>%
     # perform lexrank at the sentence level
     lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>%
     # highest-ranked sentence(s) first
     arrange(desc(lexrank)) %>%
     # seq_len() keeps n = 0 empty; 1:n would expand to c(1, 0) and
     # still return the first row
     slice(seq_len(n))
 }

 # Guard the ranking helper: possibly() substitutes the `otherwise` value
 # (NULL) when get_top_sentences errors, so one bad document does not
 # stop the run.
 safe_top_sent <- purrr::possibly(
   get_top_sentences,
   otherwise = NULL,
   quiet = FALSE
 )

 # Top sentence(s) per document
 gm_rank_doc_level <- gm_unnest %>%
   # break into a list holding one data frame per doc_id
   split(.$doc_id) %>%
   # rank within each document and keep its two best sentences
   map(safe_top_sent, text_col = "gm_articles", n = 2) %>%
   # stack the per-document results back into one data frame
   bind_rows()
	##################################################################
	# ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8
	##################################################################

	# Monduiz'S EXAMPLE CODE
	##################################################################
	library(rvest)
	library(tidyverse)
	library(stringr)
	library(purrr)
	library(lexRankr)

	# Download the politics landing page; the link and title selectors below
	# are both run against this single document.
	gm_headlines <- read_html("https://beta.theglobeandmail.com/politics/")

	# Article URLs taken from the card links, resolved against the site root
	gm_links <- gm_headlines %>%
	  html_nodes(".o-card__link") %>%
	  html_attr("href") %>%
	  xml2::url_absolute("https://beta.theglobeandmail.com")

	# Read every linked page
	pages <- gm_links %>% map(read_html)

	# For each page, the text of every ".c-article-body__text" node
	# (a list with one character vector per page)
	gm_articles <- pages %>%
	  map(. %>%
	    html_nodes(".c-article-body__text") %>%
	    html_text()
	  )

	# Headline text taken from the same landing page
	gm_titles <- gm_headlines %>%
	  html_nodes(".o-card__content-text") %>%
	  html_text()

	# tibble() replaces the deprecated data_frame().
	# NOTE(review): this assumes both selectors return the same number of
	# nodes, in the same order -- confirm against the live page.
	gm <- tibble(gm_titles, gm_links, gm_articles)

	# Remove duplicates and video links
	gm <- gm %>%
	  distinct(gm_titles, .keep_all = TRUE) %>%
	  filter(!str_detect(gm_links, "video")) %>%
	  # row_number() stays valid when no rows survive the filter, whereas
	  # 1:length(x) would expand to c(1, 0) and make mutate() fail
	  mutate(doc_id = row_number())


	### summarization
	# One row per article text node, tagged with its document id
	gm_unnest <- gm %>%
	  select(doc_id, gm_articles) %>%
	  unnest(gm_articles)
	#------------------------------------------------------------


	# MODIFICATION TO GET TOP LEXRANK PER DOC
	##################################################################

	#' Get the top LexRank-scored sentence(s) from a data frame of text
	#'
	#' Parses the text column into sentences, scores the sentences with
	#' lexRankr, sorts them by descending `lexrank` and keeps the first `n`.
	#'
	#' @param df_in Data frame holding the text to rank.
	#' @param text_col Name (as a string) of the column in `df_in` that
	#'   contains the text.
	#' @param n Number of top-ranked sentences to keep; `0` gives zero rows.
	#' @return The sentence-level data frame, ordered by descending
	#'   `lexrank`, with at most `n` rows.
	get_top_sentences <- function(df_in, text_col = "text", n = 1) {
	  df_in %>%
	    # parse sentences into a "sentences" column
	    lexRankr::unnest_sentences_("sentences", text_col) %>%
	    # perform lexrank at the sentence level
	    lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>%
	    # highest-ranked sentence(s) first
	    arrange(desc(lexrank)) %>%
	    # seq_len() keeps n = 0 empty; 1:n would expand to c(1, 0) and
	    # still return the first row
	    slice(seq_len(n))
	}

	# Guard the ranking helper: possibly() substitutes the `otherwise` value
	# (NULL) when get_top_sentences errors, so one bad document does not
	# stop the run.
	safe_top_sent <- purrr::possibly(
	  get_top_sentences,
	  otherwise = NULL,
	  quiet = FALSE
	)

	# Top sentence(s) per document
	gm_rank_doc_level <- gm_unnest %>%
	  # break into a list holding one data frame per doc_id
	  split(.$doc_id) %>%
	  # rank within each document and keep its two best sentences
	  map(safe_top_sent, text_col = "gm_articles", n = 2) %>%
	  # stack the per-document results back into one data frame
	  bind_rows()