Last active
December 18, 2015 15:58
-
-
Save ColCarroll/5807955 to your computer and use it in GitHub Desktop.
A demonstration of pulling data from a Wikipedia table and cleaning up the "List of highest mountains" table.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert the text of an XML/HTML node to an integer when it is a whole number.
#
# Used as the `elFun` callback of readHTMLTable() in scrapeTable().
#
# Args:
#   node: a node accepted by XML's xmlValue().
#
# Returns:
#   An integer when the node text, with thousands separators (",") removed,
#   is a whole number that fits in R's integer range; otherwise the node text
#   unchanged.
tryAsInteger <- function(node) {
  val <- xmlValue(node)

  # Nothing to parse. The length guard is defensive (assumes xmlValue() may
  # yield a zero-length result for some nodes -- TODO confirm); without it the
  # `if` below would fail on a zero-length condition.
  if (length(val) != 1L || is.na(val)) {
    return(val)
  }

  digits <- gsub(",", "", val, fixed = TRUE)

  # Only accept plain whole numbers. as.integer() would otherwise silently
  # truncate decimal text such as "27.98" to 27 and would warn on every
  # non-numeric cell.
  if (!grepl("^[[:space:]]*[+-]?[0-9]+[[:space:]]*$", digits)) {
    return(val)
  }

  # A digit string can still overflow the integer range; as.integer() then
  # returns NA with a warning, in which case the original text is kept.
  ans <- suppressWarnings(as.integer(digits))
  if (is.na(ans)) {
    val
  } else {
    ans
  }
}
# Parse a web page and return the HTML table with the most rows.
#
# Args:
#   theurl: location of the page, passed straight to htmlParse().
#
# Returns:
#   (Invisibly) the table with the most rows; on ties the last such table in
#   page order is chosen.
#
# Side effects (kept for interactive use of this script):
#   Attaches the XML package and assigns three variables outside the function
#   with `<<-`:
#     tables   - all non-NULL tables found on the page
#     id       - indices into `tables`, ordered by increasing row count
#     bigTable - the table that is also returned
scrapeTable <- function(theurl = "http://en.wikipedia.org/wiki/List_of_highest_mountains") {
  # library() instead of require(): a missing package should stop here with a
  # clear error rather than fail later on an undefined function.
  library(XML)

  html <- htmlParse(theurl)
  parsed <- readHTMLTable(html, stringsAsFactors = FALSE, elFun = tryAsInteger)

  # Drop entries that could not be read as tables.
  parsed <- parsed[!vapply(parsed, is.null, logical(1))]
  if (length(parsed) == 0L) {
    stop("No HTML tables found at ", theurl, call. = FALSE)
  }

  row_counts <- vapply(parsed, NROW, integer(1))
  by_size <- order(row_counts)
  largest <- parsed[[tail(by_size, n = 1)]]

  # Publish the filtered list so that `id` really indexes `tables`.
  tables <<- parsed
  id <<- by_size
  bigTable <<- largest

  invisible(largest)
}
# Give the scraped mountain table fixed column names and numeric columns.
#
# Args:
#   mtnTable: data frame with at least ten columns, in the order Rank,
#     Mountain, Height.meters, Height.feet, Range, Coordinates, Prominence,
#     ParentMountain, FirstAscent, AscentsBefore2004.
#
# Returns:
#   The data frame with its first ten columns renamed as above and the columns
#   Rank, Height.meters, Height.feet, Prominence and FirstAscent converted with
#   as.numeric(). Thousands separators in character columns are removed first;
#   values that still cannot be parsed become NA (with as.numeric()'s usual
#   warning).
fixMountainTable <- function(mtnTable) {
  stopifnot(is.data.frame(mtnTable))

  col_names <- c(
    "Rank", "Mountain", "Height.meters", "Height.feet", "Range",
    "Coordinates", "Prominence", "ParentMountain", "FirstAscent",
    "AscentsBefore2004"
  )
  if (ncol(mtnTable) < length(col_names)) {
    stop(
      "Expected at least ", length(col_names), " columns, got ",
      ncol(mtnTable),
      call. = FALSE
    )
  }
  # Rename only the known columns so any extra columns keep their names.
  names(mtnTable)[seq_along(col_names)] <- col_names

  to_numeric <- function(x) {
    if (is.character(x)) {
      x <- gsub(",", "", x, fixed = TRUE)
    }
    as.numeric(x)
  }

  nums <- c("Rank", "Height.meters", "Height.feet", "Prominence", "FirstAscent")
  # lapply() keeps one vector per column whatever the number of rows;
  # sapply() changes shape for one-row and zero-row tables.
  mtnTable[nums] <- lapply(mtnTable[nums], to_numeric)

  mtnTable
}
# Walk through the commands shown at the meeting and write two example plots.
#
# Many of the statements are meant to be run interactively, so just calling
# the function won't look that impressive. If you want to just run the
# function, you can call
# > source('june18.R')
# > meetingExamples()
#
# Side effects:
#   - attaches the scales and ggplot2 packages;
#   - assigns `train.data` (the random sample of diamonds) and `p` (the base
#     ggplot object of the first plot) outside the function with `<<-`;
#   - writes "caratsvsprice.png" and "contourplot.png" to the working
#     directory.
#
# Returns:
#   The value of the final dev.off() call.
meetingExamples <- function() {
  # library() instead of require(): stop immediately if a package is missing.
  library(scales)
  library(ggplot2) # This has the diamonds dataset. ??diamonds will find it in there

  # Print a plot to a 1000x1000 PNG file. The device is closed even when
  # printing fails, so a failed plot does not leave a graphics device open.
  save_png <- function(plot, file) {
    png(file, width = 1000, height = 1000)
    closed <- FALSE
    on.exit(if (!closed) dev.off(), add = TRUE)
    print(plot)
    closed <- TRUE
    dev.off()
  }

  head(diamonds) # Print the first 6 rows of data. Use optional second argument to print more (see ?head)
  summary(diamonds) # Summarize the diamond data set

  diamonds$cut # Three ways to access the "cut" column of the diamonds data
  diamonds[, 2]
  diamonds[, "cut"]

  four_cs <- diamonds[, c("carat", "cut", "color", "clarity")] # creating a new data frame with only 4 columns
  four_cs <- diamonds[, c(1, 2, 3, 4)] # same
  four_cs <- diamonds[, 1:4] # same
  four_cs <- diamonds[, c(-5, -6, -7, -8, -9, -10)] # same. Minus means "not this column"

  nsamples <- 10000
  # Select `nsamples` rows at random (without replacement) from diamonds.
  train.data <<- diamonds[sample.int(nrow(diamonds), nsamples), ]

  # Below is an example of a plot you might produce, using some methods I showed and some I didn't. Remember
  # that ?method.name is your friend. Online documentation is also very good.
  p <<- ggplot(train.data, aes(x = carat, y = price)) # Base ggplot2 object. I do not include color here because I do
                                                      # not want a line for each color
  p <- p + geom_point(alpha = I(0.2), aes(color = color), position = 'jitter') # Color aesthetic is added at the geom level
  p <- p + geom_smooth() # Adds a smoothed trend line; see ?geom_smooth for how the method is chosen
  p <- p + facet_grid(cut~clarity) # Split the data by categorical variables
  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2)) # Formatting the x-axis with log scaling
  p <- p + scale_y_continuous(name = "Price", labels = dollar) # Using the "scales" package to format the y-axis
  p <- p + scale_size_continuous(name = "Depth (mm)") # You can also set legend attributes (no size aesthetic is mapped in this plot)
  p <- p + ggtitle(sprintf("Carats vs price in a sample of %s diamonds", nsamples)) # String formatting in R
  p <- p + theme_minimal() # Some prebuilt themes to make your plots look great
  save_png(p, "caratsvsprice.png") # This will save the plot

  # Another plot will generate a density plot for the data. It doesn't exactly fit, but it is an interesting
  # way to look at the data and shows off some other geoms. This uses ..level.. as the height, but you could
  # equally well supply a z vector that recorded counts
  p <- ggplot(train.data, aes(x = carat, y = price))
  p <- p + geom_point(position = 'jitter', alpha = I(0.3)) # Underlying data set
  p <- p + geom_density2d(na.rm = TRUE, bins = 20, size = I(0.5), alpha = I(0.5), aes(color = ..level..)) # Minor contours
  p <- p + geom_density2d(na.rm = TRUE, bins = 4, size = I(2), aes(color = ..level..)) # Major contours
  p <- p + scale_x_log10(name = "Carats", breaks = c(0.5, 1, 2))
  p <- p + scale_y_log10(name = "Price", labels = dollar)
  p <- p + scale_color_continuous(name = "Data point density")
  p <- p + ggtitle("Density of carats vs. price")
  save_png(p, "contourplot.png") # This will save the plot
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment