@primaryobjects
Created June 27, 2014 17:55
Grabs all names and LinkedIn URLs from a Coursera forum thread. Perfect for the "Let's Connect!" threads.
require(XML)

#
# Grabs all names and LinkedIn URLs from a Coursera forum thread. Perfect for the "Let's Connect!" threads.
#
# Usage:
# 1. Open a Coursera forum thread containing LinkedIn links.
# 2. Scroll all the way to the bottom of the page to load all posts in the thread.
# 3. Save the web page to an html file named post.htm.
# 4. Call linkedInLinks("post.htm").
# 5. To save the result to a file: write.csv(linkedInLinks("post.htm"), "links.csv", quote=FALSE, row.names=FALSE)
#
linkedInLinks <- function(fileName) {
  # Read the html page from file.
  data <- htmlTreeParse(fileName, useInternalNodes = TRUE)

  # Find all posts containing a LinkedIn url.
  posts <- xpathApply(data, "//div[@class='course-forum-post-view-container' and .//a[contains(@href, 'linkedin')]]")

  # Cast each post to its own xml document, so that XPath queries can be scoped to a single post.
  postElements <- lapply(posts, xmlDoc)

  # Get the author <a> elements (forum profile links) within each post.
  authorElements <- unlist(lapply(postElements, function(postElement) xpathApply(postElement, "//a[contains(@href, 'forum/profile')]")))

  # Get the author names from each element.
  authors <- sapply(authorElements, xmlValue)

  # Find all LinkedIn links in the comment bodies.
  commentElements <- unlist(lapply(postElements, function(postElement) xpathApply(postElement, "//div[@class='course-forum-post-text']//a[contains(@href, 'linkedin')]")))

  # Get the href value for each link.
  links <- sapply(commentElements, function(el) xmlGetAttr(el, "href"))

  # Return the result. Note: this assumes each matching post has exactly one
  # author link and one LinkedIn link; otherwise the two vectors may differ in length.
  data.frame(Name = authors, Url = links)
}
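
# For reference, a minimal usage sketch. The file names post.htm and links.csv
# follow the comments above; adjust them to match your saved thread page.

# Collect names and LinkedIn URLs from the saved thread page.
connections <- linkedInLinks("post.htm")

# Inspect the first few rows.
head(connections)

# Save the result to a csv file.
write.csv(connections, "links.csv", quote = FALSE, row.names = FALSE)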