Created
June 27, 2014 17:55
-
-
Save primaryobjects/906e79b836163a66b0cc to your computer and use it in GitHub Desktop.
Grabs all LinkedIn urls from a Coursera forum thread. Perfect for the "Let's Connect!" threads.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(XML) | |
require(stringr) | |
#
# Grabs all names and LinkedIn urls from a Coursera forum thread. Perfect for the "Let's Connect!" threads.
# Usage:
# 1. Open a Coursera forum thread, containing LinkedIn links.
# 2. Scroll all the way to the bottom of the page to load all posts in the thread.
# 3. Save the web page to an html file named post.htm.
# 4. Call linkedInLinks("post.htm")
# 5. To save the result to a file: write.csv(linkedInLinks("post.htm"), "links.csv", row.names=FALSE)
#    (values are left quoted so that a name containing a comma does not break the CSV columns)
#
# Args:
#   fileName: Path to the saved HTML page.
#
# Returns:
#   A data.frame with character columns Name and Url, holding one row per
#   LinkedIn link found in a post's text. A post with several links yields
#   several rows that share the same Name. When nothing matches, a zero-row
#   data.frame with the same two columns is returned.
#
linkedInLinks <- function(fileName) {
  # XPath expressions; the two post-level ones are relative (".//") so they only
  # search inside the post node they are evaluated against.
  postPath <- "//div[@class='course-forum-post-view-container' and .//a[contains(@href, 'linkedin')]]"
  authorPath <- ".//a[contains(@href, 'forum/profile')]"
  linkPath <- ".//div[@class='course-forum-post-text']//a[contains(@href, 'linkedin')]"

  # Read html page from file.
  page <- XML::htmlTreeParse(fileName, useInternalNodes = TRUE)

  # Find all posts containing a LinkedIn url.
  posts <- XML::getNodeSet(page, postPath)

  # Text of a node as exactly one string (NA when the node has no usable text).
  nodeText <- function(node) {
    value <- XML::xmlValue(node)
    if (length(value) == 1L) value else NA_character_
  }

  # Build the rows post by post so every url stays paired with the author of
  # the post it was found in, whatever the number of links per post.
  rows <- lapply(posts, function(post) {
    # Author name: first profile anchor that carries visible text.
    # NOTE(review): assumes text-less profile anchors (e.g. avatar images) are
    # not the author name - confirm against a real saved thread.
    authorNames <- vapply(XML::getNodeSet(post, authorPath), nodeText, character(1))
    authorNames <- authorNames[grepl("\\S", authorNames)]
    author <- if (length(authorNames) > 0L) authorNames[[1L]] else NA_character_

    # Get the href value of every LinkedIn link inside the post text.
    urls <- vapply(
      XML::getNodeSet(post, linkPath),
      function(node) XML::xmlGetAttr(node, "href", NA_character_),
      character(1)
    )

    data.frame(
      Name = rep(author, length(urls)),
      Url = urls,
      stringsAsFactors = FALSE
    )
  })

  # Leading empty frame guarantees the Name/Url columns exist even when no
  # post matched.
  empty <- data.frame(
    Name = character(0),
    Url = character(0),
    stringsAsFactors = FALSE
  )
  result <- do.call(rbind, c(list(empty), rows))
  rownames(result) <- NULL

  # Return result.
  result
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment