Addressing Werner DeJuan's comment on https://adamspannbauer.github.io/2017/12/17/summarizing-web-articles-with-r/
#load needed packages
library(xml2)
library(rvest)
library(lexRankr)

#url to scrape
my_url = "http://www.freepatentsonline.com/y2014/0278285.html"

#read page html
page = xml2::read_html(my_url)

#extract text from page html using selector
page_text = rvest::html_text(rvest::html_nodes(page, "p"))

#perform lexrank for top 3 sentences
top_3 = lexRankr::lexRank(page_text,
                          #only 1 article; repeat same docId for all of input vector
                          docId = rep(1, length(page_text)),
                          #return 3 sentences to mimic /u/autotldr's output
                          n = 3,
                          continuous = TRUE)

#reorder the top 3 sentences to be in order of appearance in article
order_of_appearance = order(as.integer(gsub("_", "", top_3$sentenceId)))

#extract sentences in order of appearance
ordered_top_3 = top_3[order_of_appearance, "sentence"]
ordered_top_3
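
Optionally, the ordered sentences can be collapsed into a single summary paragraph for display. The lines below are an illustrative sketch, not part of the original gist; they assume ordered_top_3 is the character vector produced above.

#optional: paste the ordered sentences into one summary string (illustrative only)
summary_text = paste(ordered_top_3, collapse = " ")
cat(summary_text)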