Skip to content

Instantly share code, notes, and snippets.

@milesgrimshaw
Last active December 12, 2015 08:59
Show Gist options
  • Save milesgrimshaw/4748004 to your computer and use it in GitHub Desktop.
Save milesgrimshaw/4748004 to your computer and use it in GitHub Desktop.
Who Are My Peers? The code I wrote both to learn R and to analyze some descriptive qualities of the current Yale undergraduate student body (classes of '13, '14, '15, and '16).
# Read the saved HTML page into a character vector, splitting on newlines.
x <- scan(
  "~/Dropbox/Documents/Senior Year/Spring/STAT230/YaleFacebook3.html",
  what = "",
  sep = "\n"
)

# All the student data lives inside the div with class 'display_data'.
# `data` holds the index of every line that mentions that class.
data <- grep("display_data", x, fixed = TRUE)

# The student markup itself is on the line right after each hit, so
# `students` holds the HTML text of those following lines.
students <- x[data + 1]

# Each student's info sits in a div with class 'student_text_container';
# splitting on that marker separates the text into one piece per student
# (plus whatever text precedes the first marker).
a <- unlist(
  strsplit(students, "student_text_container", fixed = TRUE, useBytes = TRUE)
)

# Break the HTML apart further: first on the literal text 'div', then on
# '<br>'. The counting functions search the resulting fragments in `c`.
# Note that `c` is only a variable here; calls such as c(1, 2) still
# resolve to the base function.
b <- unlist(strsplit(a, "div", fixed = TRUE, useBytes = TRUE))
c <- unlist(strsplit(b, "<br>", fixed = TRUE, useBytes = TRUE))
#Functions
# Count the fragments in the global vector `c` that contain the month
# abbreviation stored at position `index` of the global `monthslist`.
#
# index: position in `monthslist` of the month to look up.
# Returns: integer count of elements of `c` containing that text.
#
# NOTE(review): the match is a literal, case-sensitive substring search, so
# an abbreviation such as "Mar" would also hit any fragment containing
# e.g. "Mark" -- confirm against the actual HTML that this cannot inflate
# the counts.
getmonthcount <- function(index) {
  length(grep(monthslist[index], c, fixed = TRUE))
}
# Count the fragments in the global vector `c` that contain the college
# name stored at position `index` of the global `collegelist`.
#
# index: position in `collegelist` of the college to look up.
# Returns: integer count of elements of `c` containing that name
#          (literal, case-sensitive substring match).
getcollegecount <- function(index) {
  length(grep(collegelist[index], c, fixed = TRUE))
}
# Count the fragments in the global vector `c` that contain the major
# name stored at position `index` of the global `majorslist`.
#
# index: position in `majorslist` of the major to look up.
# Returns: integer count of elements of `c` containing that name.
#
# The match is a literal, case-sensitive substring search, so a short name
# is also counted wherever it appears inside a longer one (for example
# "Economics" is found inside "Economics & Mathematics").
getmajorcount <- function(index) {
  length(grep(majorslist[index], c, fixed = TRUE))
}
# Find out how many students have a birthday in each month.
monthslist <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)

# One getmonthcount() call per month. vapply() over seq_along() replaces the
# original zero-based loop with its manual +1 offsets and guarantees an
# integer count column.
months <- data.frame(
  months = monthslist,
  count = vapply(seq_along(monthslist), getmonthcount, integer(1))
)

# Export the bar chart to birthdays.png in the working directory.
png(filename = "birthdays.png", width = 800, height = 600)
# Widen the left margin a little beyond the default.
par(mar = c(5, 6, 4, 2) + 0.1)
barplot(
  months$count,
  names.arg = months$months,
  xlim = NULL,
  cex.names = 0.9,
  ylab = "Students",
  xlab = "Birthday",
  main = "Number of Yale Students By Month of Birthday"
)
dev.off()
# Find out how many students are in each Yale residential college.
collegelist <- c(
  "Berkeley College", "Branford College", "Calhoun College",
  "Davenport College", "Ezra Stiles College",
  "Jonathan Edwards College", "Morse College", "Pierson College",
  "Saybrook College", "Silliman College", "Timothy Dwight College",
  "Trumbull College"
)

# One getcollegecount() call per college. vapply() over seq_along() replaces
# the original zero-based loop with its manual +1 offsets and guarantees an
# integer count column.
colleges <- data.frame(
  college = collegelist,
  count = vapply(seq_along(collegelist), getcollegecount, integer(1))
)

# Shorter labels keep the chart readable: drop the " College" suffix and
# abbreviate the three longest names. str_replace() comes from stringr.
library("stringr")
collegelistforprint <- str_replace(collegelist, " College", "")
collegelistforprint <- str_replace(collegelistforprint, "Ezra Stiles", "Stiles")
collegelistforprint <- str_replace(collegelistforprint, "Jonathan Edwards", "JE")
collegelistforprint <- str_replace(collegelistforprint, "Timothy Dwight", "TD")

# Plot the number of students by college into colleges.png.
png(filename = "colleges.png", width = 800, height = 600)
par(mar = c(5, 6, 4, 2) + 0.1)
barplot(
  colleges$count,
  names.arg = collegelistforprint,
  xlim = NULL,
  cex.names = 0.8,
  xlab = "College",
  ylab = "Students",
  main = "Number of Students By Residential College"
)
dev.off()
# Character vector of undergraduate major names, plus "Undeclared".
# The spellings (including the entries with missing spaces) are used as
# literal search text by getmajorcount(), so they must stay exactly as is.
majorslist <- c(
  "African American Studies",
  "African Studies",
  "American Studies",
  "Anthropology",
  "Applied Mathematics",
  "Applied Physics",
  "Archaeological Studies",
  "Architecture",
  "Art",
  "Astronomy",
  "Astronomy & Physics",
  "Biology",
  "Chemistry",
  "Chinese",
  "Classical Civilization",
  "Classics",
  "Cognitive Science",
  "Computer Science",
  "Computer Science & Mathematics",
  "Computer Science & Psychology",
  "Computing and the Arts",
  "East Asian Studies",
  "Ecology & Evolutionary Biology",
  "Economics",
  "Economics & Mathematics",
  "Electrical Engineering",
  "Engineering",
  "English",
  "Environmental Studies",
  "Ethics,Politics & Economics",
  "Ethnicity, Race & Migration",
  "Film Studies",
  "French",
  "Geology & Geophysics",
  "German",
  "German Studies",
  "Global Affairs",
  "History",
  "History of Art",
  "Humanities",
  "Italian",
  "Latin American Studies",
  "Linguistics",
  "Literature",
  "Mathematics",
  "Mathematics & Philosophy",
  "Mathematics & Physics",
  "Modern Middle Eastern Studies",
  "Molecular Biophysics & Biochem",
  "Molecular,Cellular,Dev Biology",
  "Music",
  "Near Eastern Languages & Civs",
  "Philosophy",
  "Physics",
  "Physics & Philosophy",
  "Political Science",
  "Psychology",
  "Religious Studies",
  "Russian",
  "Sociology",
  "Spanish",
  "Statistics",
  "Theater Studies",
  "Women'sGender&SexualityStudies",
  "Undeclared"
)
# Number of students for each major. One getmajorcount() call per entry of
# majorslist; vapply() over seq_along() replaces the original zero-based
# loop with its manual +1 offsets and guarantees an integer count column.
majors <- data.frame(
  majors = majorslist,
  count = vapply(seq_along(majorslist), getmajorcount, integer(1))
)

# Plot students by major as a tall horizontal bar chart (allmajors.png).
png(filename = "allmajors.png", width = 800, height = 1200)
# las = 2 draws axis labels perpendicular to the axis, so the long major
# names on the y axis read horizontally; the wide left margin fits them.
par(las = 2)
par(mar = c(5, 15, 4, 1) + 0.1)
barplot(
  majors$count,
  names.arg = majorslist,
  space = 0.8,
  horiz = TRUE,
  xlim = NULL,
  cex.names = 1,
  xlab = "Number of Students",
  # Title typo fixed: the original read "Numer Of Students ...".
  main = "Number Of Students With Each Declared Major"
)
dev.off()
# Most popular majors: rows of `majors` with more than 50 students, with
# the "Undeclared" row removed. which() drops NA comparisons just as
# subset() would.
over_fifty <- majors[which(majors$count > 50), ]
mostmajors <- over_fifty[which(over_fifty$majors != "Undeclared"), ]

# Graph the most popular majors (mostpopularmajors.png).
png(file = "mostpopularmajors.png", width = 800, height = 600)
par(las = 2)
par(mar = c(5, 13, 4, 2) + 0.1)
barplot(
  mostmajors$count,
  names.arg = mostmajors$majors,
  space = 0.5,
  horiz = TRUE,
  xlim = NULL,
  cex.names = 0.9,
  xlab = "Number of Students",
  main = "The Most Popular Majors at Yale"
)
dev.off()

# Least common majors: rows of `majors` with fewer than 30 students.
# Majors with 30 to 50 students therefore appear in neither chart, and
# "Undeclared" is not filtered out here.
fewmajors <- majors[which(majors$count < 30), ]

# Graph the least popular majors (leastpopularmajors.png).
png(file = "leastpopularmajors.png", width = 800, height = 900)
par(las = 2)
par(mar = c(5, 13, 4, 2) + 0.1)
barplot(
  fewmajors$count,
  names.arg = fewmajors$majors,
  space = 0.7,
  horiz = TRUE,
  xlim = NULL,
  cex.names = 0.9,
  xlab = "Number of Students",
  main = "The Least Popular Majors at Yale"
)
dev.off()
# Goal: the number of students with each popular major, per college.
# Result: data frame `majorsbycollege` with the major names in column 1 and
# one integer column per college, named by its short label from
# collegelistforprint.
majorsbycollege <- data.frame(major = mostmajors$majors)

# For every popular major, find the per-student pieces in `a` that mention
# it. This is done once up front rather than once per college; as.character()
# keeps the search text a plain string even if the column is a factor.
# Matching is by literal substring, so "Economics" also picks up students
# whose text contains "Economics & Mathematics".
majornames <- as.character(mostmajors$majors)
studentswithmajor <- lapply(
  majornames,
  function(majortocheck) grep(majortocheck, a, fixed = TRUE)
)

# Run through each college and count how many of each major's students
# also mention that college.
for (i in seq_along(collegelist)) {
  # Pieces of `a` that mention this college; computed once per college
  # instead of once per (college, major) pair.
  studentswithcollege <- grep(collegelist[i], a, fixed = TRUE)

  # Overlap between "has this major" and "is in this college", per major.
  # seq_along()/vapply() also behave correctly when there are no rows.
  majorsbycollege[[collegelistforprint[i]]] <- vapply(
    studentswithmajor,
    function(hits) sum(hits %in% studentswithcollege),
    integer(1)
  )
}
# Draw one "<major> by college" bar chart from `majorsbycollege`.
#
# major:    exact major name as it appears in column 1 of majorsbycollege.
# filename: PNG file to write.
# Returns (invisibly) the row index used, or NA if the major has no row.
#
# The row is looked up by exact name rather than with grep(): a substring
# search can return several rows ("Economics" also hits
# "Ethics,Politics & Economics") or none, and either case makes
# as.integer() fail on the selected data-frame slice. A major with no row
# is skipped with a warning so the remaining charts are still produced.
plotmajorbycollege <- function(major, filename) {
  row <- match(major, as.character(majorsbycollege[, 1]))
  if (is.na(row)) {
    warning(
      "No row for major '", major, "' in majorsbycollege; skipping ",
      filename,
      call. = FALSE
    )
    return(invisible(NA_integer_))
  }

  # College counts occupy the columns after the major-name column.
  counts <- as.integer(
    unlist(majorsbycollege[row, seq_along(collegelist) + 1], use.names = FALSE)
  )

  png(filename = filename, width = 800, height = 600)
  # Close the device even if plotting fails part-way through.
  on.exit(dev.off(), add = TRUE)
  par(mar = c(5, 6, 4, 2) + 0.1)
  barplot(
    counts,
    names.arg = collegelistforprint,
    cex.names = 0.8,
    xlab = "College",
    main = paste("Number of", major, "Majors by College")
  )
  invisible(row)
}

# One chart per major of interest. The row index is kept under the same
# variable names the script used before.
engineering <- plotmajorbycollege("Engineering", "engineering.png")
computer <- plotmajorbycollege("Computer Science", "compsci.png")
# The exact-name lookup selects the plain "Economics" row directly, so the
# old economics[1] workaround is no longer needed.
economics <- plotmajorbycollege("Economics", "econ.png")
english <- plotmajorbycollege("English", "english.png")
physics <- plotmajorbycollege("Physics", "physics.png")
mathematics <- plotmajorbycollege("Mathematics", "mathematics.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment