AdamSpannbauer · November 24, 2017 18:39
diff --git a/lexrank_doc_map.R b/lexrank_doc_map.R
 ##################################################################
 # ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8
 ##################################################################

 # GET EXAMPLE DATA
 #----------------------------------------------------------
 library(xml2)
 library(rvest)
 options(stringsAsFactors = FALSE)

 #two urls with stories from cnn.com
 urls = c("http://money.cnn.com/2017/11/20/technology/google-pixel-buds-review/index.html",
          "http://money.cnn.com/2017/11/23/technology/battlesgrounds-game-tencent-china/index.html")

 #css selector to get story text
 selector = c("#storytext p , .speakable")

 #build one data frame per url, with one row per html node matched by `selector`
 my_df_list = lapply(seq_along(urls), function(i) {
   #get url i
   u = urls[i]
   #read page
   raw_html   = xml2::read_html(u)
   #extract nodes with selector
   story_text = rvest::html_nodes(raw_html, selector)
   #drop html tags, keeping the text of each node
   text_lines = rvest::html_text(story_text)
   n_lines    = length(text_lines)

   #put in df with id info
   #the id columns are repeated explicitly so that a page with no matching
   #nodes yields a zero-row df; data.frame() raises an error when length-one
   #columns have to be recycled against a zero-length column
   data.frame(doc_id = rep(i, n_lines),
              url    = rep(u, n_lines),
              text   = text_lines,
              stringsAsFactors = FALSE)
 })

 #combine into single df
 my_df = do.call('rbind', my_df_list)
 #----------------------------------------------------------

 # POSSIBLE TIDYVERSE SOLUTION TO ISSUE USING `purrr::map()`
 #----------------------------------------------------------
 library(dplyr)
 library(purrr)
 #convert to tibble (as_tibble() is the replacement for the deprecated as_data_frame())
 my_tbl = as_tibble(my_df)

 #function to get the top lexranked sentence(s) in a df
 #  df_in:    data frame with a column of text
 #  text_col: name (string) of the column of `df_in` that contains the text
 #  n:        number of top ranked sentences to keep; 0 keeps no rows
 #returns the sentence-level data frame built by the lexRankr calls below,
 #sorted by descending `lexrank` and cut down to its first `n` rows
 get_top_sentences = function(df_in, text_col = "text", n=1) {
   #perform piped lexrank process and extract top ranked sentence(s)
   lex_df = lexRankr::unnest_sentences_(df_in, "sentences", text_col) %>% #parse sentences
     lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>% #perform lexrank
     arrange(desc(lexrank)) %>% #highest lexrank first
     slice(seq_len(n)) #seq_len(0) is empty, whereas 1:0 counts down to c(1, 0)
   return(lex_df)
 }

 #top ranked sentence(s) of every document, gathered in a single df:
 #  1. split() breaks my_tbl into a list with one df per doc_id
 #  2. map() applies get_top_sentences (n=1) to each document df
 #  3. bind_rows() recombines the per-document results
 top_sent_df = bind_rows(
   map(split(my_tbl, my_tbl$doc_id), get_top_sentences, n=1)
 )
 #----------------------------------------------------------

 # OUTPUT
 #----------------------------------------------------------
 #text of the top ranked sentence of each document (recorded output below)
 top_sent_df[["sentences"]]
 # [1] " But when Google (GOOG) announced its new Pixel Buds in October, 
 #       touting the ability to translate a conversation between different 
 #       languages in near real time, it promised something unique."         
 # [2] " Chinese tech giant Tencent (TCEHY) has announced plans to 
 #       distribute PlayerUnknown's \"Battlegrounds\" in its home market 
 #       after modifying the violent game to comply with \"socialist core 
 #       values.\" "
 #lexrank value of each of those sentences (recorded output below)
 top_sent_df[["lexrank"]]
 # [1] 0.06505914 0.07053404
 #----------------------------------------------------------
	##################################################################
	# ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8
	##################################################################

	# GET EXAMPLE DATA
	#----------------------------------------------------------
	library(xml2)
	library(rvest)
	options(stringsAsFactors = FALSE)

	#two urls with stories from cnn.com
	urls = c("http://money.cnn.com/2017/11/20/technology/google-pixel-buds-review/index.html",
	         "http://money.cnn.com/2017/11/23/technology/battlesgrounds-game-tencent-china/index.html")

	#css selector to get story text
	selector = c("#storytext p , .speakable")

	#build one data frame per url, with one row per html node matched by `selector`
	my_df_list = lapply(seq_along(urls), function(i) {
	  #get url i
	  u = urls[i]
	  #read page
	  raw_html = xml2::read_html(u)
	  #extract nodes with selector
	  story_text = rvest::html_nodes(raw_html, selector)
	  #drop html tags, keeping the text of each node
	  text_lines = rvest::html_text(story_text)
	  n_lines = length(text_lines)

	  #put in df with id info
	  #the id columns are repeated explicitly so that a page with no matching
	  #nodes yields a zero-row df; data.frame() raises an error when length-one
	  #columns have to be recycled against a zero-length column
	  data.frame(doc_id = rep(i, n_lines),
	             url = rep(u, n_lines),
	             text = text_lines,
	             stringsAsFactors = FALSE)
	})

	#combine into single df
	my_df = do.call('rbind', my_df_list)
	#----------------------------------------------------------

	# POSSIBLE TIDYVERSE SOLUTION TO ISSUE USING `purrr::map()`
	#----------------------------------------------------------
	library(dplyr)
	library(purrr)
	#convert to tibble (as_tibble() is the replacement for the deprecated as_data_frame())
	my_tbl = as_tibble(my_df)

	#function to get the top lexranked sentence(s) in a df
	#  df_in:    data frame with a column of text
	#  text_col: name (string) of the column of `df_in` that contains the text
	#  n:        number of top ranked sentences to keep; 0 keeps no rows
	#returns the sentence-level data frame built by the lexRankr calls below,
	#sorted by descending `lexrank` and cut down to its first `n` rows
	get_top_sentences = function(df_in, text_col = "text", n=1) {
	  #perform piped lexrank process and extract top ranked sentence(s)
	  lex_df = lexRankr::unnest_sentences_(df_in, "sentences", text_col) %>% #parse sentences
	    lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>% #perform lexrank
	    arrange(desc(lexrank)) %>% #highest lexrank first
	    slice(seq_len(n)) #seq_len(0) is empty, whereas 1:0 counts down to c(1, 0)
	  return(lex_df)
	}

	#top ranked sentence(s) of every document, gathered in a single df:
	#  1. split() breaks my_tbl into a list with one df per doc_id
	#  2. map() applies get_top_sentences (n=1) to each document df
	#  3. bind_rows() recombines the per-document results
	top_sent_df = bind_rows(
	  map(split(my_tbl, my_tbl$doc_id), get_top_sentences, n=1)
	)
	#----------------------------------------------------------

	# OUTPUT
	#----------------------------------------------------------
	#text of the top ranked sentence of each document (recorded output below)
	top_sent_df[["sentences"]]
	# [1] " But when Google (GOOG) announced its new Pixel Buds in October,
	# touting the ability to translate a conversation between different
	# languages in near real time, it promised something unique."
	# [2] " Chinese tech giant Tencent (TCEHY) has announced plans to
	# distribute PlayerUnknown's \"Battlegrounds\" in its home market
	# after modifying the violent game to comply with \"socialist core
	# values.\" "
	#lexrank value of each of those sentences (recorded output below)
	top_sent_df[["lexrank"]]
	# [1] 0.06505914 0.07053404
	#----------------------------------------------------------