Last active
December 12, 2015 08:59
-
-
Save milesgrimshaw/4748004 to your computer and use it in GitHub Desktop.
Who Are My Peers? The code I wrote to both learn R and analyze some descriptive qualities of the current Yale undergraduate student body (Classes '13, '14, '15, '16).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pull in the saved HTML page: one element of `x` per line of the file
# (scan() drops blank lines by default).
x <- scan("~/Dropbox/Documents/Senior Year/Spring/STAT230/YaleFacebook3.html",
          what = "", sep = "\n")
# All the student data is within the div with class='display_data'.
# `data` holds the line number(s) of `x` that contain that marker.
data <- grep("display_data", x, fixed = TRUE)
# Fail fast when the marker is missing: otherwise `students` is empty and
# every count and chart produced further down is silently zero.
if (length(data) == 0L) {
  stop("no line containing 'display_data' was found in the HTML input",
       call. = FALSE)
}
# The student data is all on the line of HTML that follows the marker, so
# `students` holds the text of that line.
students <- x[data + 1]
# Each student's info is contained within the div with
# class='student_text_container', so split on that to get one student per
# element of `a`.
a <- unlist(strsplit(students, "student_text_container", fixed = TRUE,
                     useBytes = TRUE))
# Break the HTML apart further: first on every 'div', then on every '<br>'.
# NOTE: the names `data` and `c` shadow base R objects; they are kept because
# the rest of the script reads these exact names.
b <- unlist(strsplit(a, "div", fixed = TRUE, useBytes = TRUE))
c <- unlist(strsplit(b, "<br>", fixed = TRUE, useBytes = TRUE))
#Functions | |
# Count how many text fragments contain a given month label.
#
# index      - position of the month within `monthnames`.
# monthnames - character vector of month labels; defaults to the script-level
#              `monthslist`, so existing one-argument calls work unchanged.
# textlines  - character vector to search; defaults to the script-level `c`
#              (the fragments produced by splitting the HTML).
#
# Returns an integer: the number of elements of `textlines` that contain the
# label as a literal (fixed, case-sensitive) substring.
# NOTE(review): because this is a substring match, a label such as "Mar" is
# counted wherever those letters occur in a fragment, not only in birthdays.
getmonthcount <- function(index, monthnames = monthslist, textlines = c) {
  month <- monthnames[index]
  length(grep(month, textlines, fixed = TRUE))
}
# Count how many text fragments contain a given residential college name.
#
# index        - position of the college within `collegenames`.
# collegenames - character vector of college names; defaults to the
#                script-level `collegelist`, so existing one-argument calls
#                work unchanged.
# textlines    - character vector to search; defaults to the script-level `c`
#                (the fragments produced by splitting the HTML).
#
# Returns an integer: the number of elements of `textlines` that contain the
# college name as a literal (fixed, case-sensitive) substring.
getcollegecount <- function(index, collegenames = collegelist, textlines = c) {
  college <- collegenames[index]
  length(grep(college, textlines, fixed = TRUE))
}
# Count how many text fragments contain a given major.
#
# index      - position of the major within `majornames`.
# majornames - character vector of major names; defaults to the script-level
#              `majorslist`, so existing one-argument calls work unchanged.
# textlines  - character vector to search; defaults to the script-level `c`
#              (the fragments produced by splitting the HTML).
#
# Returns an integer: the number of elements of `textlines` that contain the
# major as a literal (fixed, case-sensitive) substring.
# NOTE(review): this is a substring match, so a short name is also counted
# inside longer ones (e.g. "Art" matches a fragment holding "History of Art",
# "Economics" matches "Economics & Mathematics"). Confirm whether an exact
# match on the fragment is what the charts should use.
getmajorcount <- function(index, majornames = majorslist, textlines = c) {
  major <- majornames[index]
  length(grep(major, textlines, fixed = TRUE))
}
# Find out how many people have a birthday in each month.
monthslist <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# One row per month. The counts are computed in a single vectorised pass over
# the month indices (seq_along() is also safe if the list is ever empty,
# unlike a 0:(n - 1) loop).
months <- data.frame(
  months = monthslist,
  count = vapply(seq_along(monthslist), getmonthcount, integer(1))
)
# Export graph: bar chart of students per birthday month.
png(file = "birthdays.png", width = 800, height = 600)
# Slightly wider left margin than the default so the y-axis label fits.
par(mar = c(5, 6, 4, 2) + 0.1)
barplot(months$count, names.arg = months$months, xlim = NULL, cex.names = 0.9,
        ylab = "Students", xlab = "Birthday",
        main = "Number of Yale Students By Month of Birthday")
dev.off()
# Find out how many people are in each Yale residential college.
collegelist <- c("Berkeley College", "Branford College", "Calhoun College",
                 "Davenport College", "Ezra Stiles College",
                 "Jonathan Edwards College", "Morse College", "Pierson College",
                 "Saybrook College", "Silliman College",
                 "Timothy Dwight College", "Trumbull College")
# One row per college. Counts are computed in one pass over the college
# indices (seq_along() is also safe if the list is ever empty, unlike a
# 0:(n - 1) loop).
colleges <- data.frame(
  college = collegelist,
  count = vapply(seq_along(collegelist), getcollegecount, integer(1))
)
# To have a simpler chart we shorten the college names for the axis labels.
# The stringr library is needed for str_replace().
library("stringr")
collegelistforprint <- str_replace(collegelist, " College", "")
collegelistforprint <- str_replace(collegelistforprint, "Ezra Stiles", "Stiles")
collegelistforprint <- str_replace(collegelistforprint, "Jonathan Edwards", "JE")
collegelistforprint <- str_replace(collegelistforprint, "Timothy Dwight", "TD")
# Plot the number of students by college.
png(file = "colleges.png", width = 800, height = 600)
# Slightly wider left margin than the default so the y-axis label fits.
par(mar = c(5, 6, 4, 2) + 0.1)
barplot(colleges$count, names.arg = collegelistforprint, xlim = NULL,
        cex.names = 0.8, xlab = "College", ylab = "Students",
        main = "Number of Students By Residential College")
dev.off()
# Undergraduate majors, in the order they are charted. The spellings
# (including the abbreviated and unspaced ones) are used verbatim as literal
# search strings against the HTML, so they must not be "tidied up".
# "Undeclared" is kept last.
majorslist <- c(
  # A
  "African American Studies", "African Studies", "American Studies",
  "Anthropology", "Applied Mathematics", "Applied Physics",
  "Archaeological Studies", "Architecture", "Art", "Astronomy",
  "Astronomy & Physics",
  # B - C
  "Biology", "Chemistry", "Chinese", "Classical Civilization", "Classics",
  "Cognitive Science", "Computer Science", "Computer Science & Mathematics",
  "Computer Science & Psychology", "Computing and the Arts",
  # E
  "East Asian Studies", "Ecology & Evolutionary Biology", "Economics",
  "Economics & Mathematics", "Electrical Engineering", "Engineering",
  "English", "Environmental Studies", "Ethics,Politics & Economics",
  "Ethnicity, Race & Migration",
  # F - I
  "Film Studies", "French", "Geology & Geophysics", "German",
  "German Studies", "Global Affairs", "History", "History of Art",
  "Humanities", "Italian",
  # L - N
  "Latin American Studies", "Linguistics", "Literature", "Mathematics",
  "Mathematics & Philosophy", "Mathematics & Physics",
  "Modern Middle Eastern Studies", "Molecular Biophysics & Biochem",
  "Molecular,Cellular,Dev Biology", "Music", "Near Eastern Languages & Civs",
  # P - W
  "Philosophy", "Physics", "Physics & Philosophy", "Political Science",
  "Psychology", "Religious Studies", "Russian", "Sociology", "Spanish",
  "Statistics", "Theater Studies", "Women'sGender&SexualityStudies",
  # Students without a declared major
  "Undeclared"
)
# Number of students for each major.
# One row per major. Counts are computed in one pass over the major indices
# (seq_along() is also safe if the list is ever empty, unlike a 0:(n - 1)
# loop).
majors <- data.frame(
  majors = majorslist,
  count = vapply(seq_along(majorslist), getmajorcount, integer(1))
)
# Plot students by major as a tall horizontal bar chart.
png(file = "allmajors.png", width = 800, height = 1200)
# las = 2 turns the axis labels perpendicular to the axis so the major names
# read horizontally; the wide left margin leaves room for the long names.
par(las = 2)
par(mar = c(5, 15, 4, 1) + 0.1)
barplot(majors$count, names.arg = majorslist, space = 0.8,
        horiz = TRUE, xlim = NULL, cex.names = 1,
        xlab = "Number of Students",
        main = "Number Of Students With Each Declared Major")
dev.off()
## Most popular majors: more than 50 students, leaving out "Undeclared".
is_popular <- majors$count > 50 & majors$majors != "Undeclared"
mostmajors <- majors[which(is_popular), ]
rm(is_popular)

# Chart the most popular majors as horizontal bars.
png(file = "mostpopularmajors.png", width = 800, height = 600)
# Perpendicular axis labels plus a wide left margin for the major names.
par(las = 2, mar = c(5, 13, 4, 2) + 0.1)
barplot(
  mostmajors$count,
  names.arg = mostmajors$majors,
  space = 0.5,
  horiz = TRUE,
  xlim = NULL,
  cex.names = 0.9,
  xlab = "Number of Students",
  main = "The Most Popular Majors at Yale"
)
dev.off()

## Least common majors: fewer than 30 students.
fewmajors <- majors[which(majors$count < 30), ]

# Chart the least popular majors as horizontal bars.
png(file = "leastpopularmajors.png", width = 800, height = 900)
# Perpendicular axis labels plus a wide left margin for the major names.
par(las = 2, mar = c(5, 13, 4, 2) + 0.1)
barplot(
  fewmajors$count,
  names.arg = fewmajors$majors,
  space = 0.7,
  horiz = TRUE,
  xlim = NULL,
  cex.names = 0.9,
  xlab = "Number of Students",
  main = "The Least Popular Majors at Yale"
)
dev.off()
# Goal: the number of students in each of the most popular majors, by college.
# Start with a data frame holding the majors in its first column; one count
# column per college is appended below, named with the short college label.
majorsbycollege <- data.frame(major = mostmajors$majors)
# For every major, the positions in `a` (one element per student chunk of the
# HTML) that mention it. These do not depend on the college, so they are
# computed once up front instead of once per college.
studentsbymajor <- lapply(
  as.character(majorsbycollege[, 1]),
  function(majortocheck) grep(majortocheck, a, fixed = TRUE)
)
# seq_along() keeps the loop from running at all when there are no colleges
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(collegelist)) {
  # Positions in `a` that mention the college currently being assessed.
  studentswithcollege <- grep(collegelist[i], a, fixed = TRUE)
  # A student counts towards a (major, college) cell when the same element of
  # `a` matched both searches; sum() over the logical membership vector gives
  # that count directly.
  majorsbycollege[[collegelistforprint[i]]] <- vapply(
    studentsbymajor,
    function(studentswithmajor) sum(studentswithmajor %in% studentswithcollege),
    integer(1)
  )
}
# Draw one "<major> by college" bar chart and write it to a PNG file.
#
# major        - exact name of the major, as stored in the first column of
#                `majortable`.
# file         - path of the PNG file to write.
# majortable   - data frame with the majors in column 1 and one count column
#                per college after it; defaults to `majorsbycollege`.
# collegenames - bar labels, one per college column; defaults to
#                `collegelistforprint`.
#
# The row is looked up by exact match rather than by pattern, so "Economics"
# cannot pick up "Ethics,Politics & Economics" and "Engineering" cannot pick
# up "Electrical Engineering". If the major is not in the table (it is only
# built from the most popular majors) the chart is skipped with a warning
# instead of failing half-way with an open graphics device.
#
# Returns TRUE invisibly when a chart was written, FALSE otherwise.
plotmajorbycollege <- function(major, file,
                               majortable = majorsbycollege,
                               collegenames = collegelistforprint) {
  row <- match(major, as.character(majortable[, 1]))
  if (is.na(row)) {
    warning("major '", major, "' is not in the majors-by-college table; ",
            "skipping ", file, call. = FALSE)
    return(invisible(FALSE))
  }
  # College counts live in the columns after the major column.
  collegecolumns <- seq_along(collegenames) + 1
  counts <- as.integer(unlist(majortable[row, collegecolumns],
                              use.names = FALSE))
  png(file = file, width = 800, height = 600)
  # Close the device even if plotting fails, so later charts are not drawn
  # into a stale file.
  on.exit(dev.off(), add = TRUE)
  par(mar = c(5, 6, 4, 2) + 0.1)
  barplot(counts, names.arg = collegenames, cex.names = 0.8, xlab = "College",
          main = paste("Number of", major, "Majors by College"))
  invisible(TRUE)
}

# Engineering by College
plotmajorbycollege("Engineering", "engineering.png")
# Computer Science by College
plotmajorbycollege("Computer Science", "compsci.png")
# Economics by College
plotmajorbycollege("Economics", "econ.png")
# English by College
plotmajorbycollege("English", "english.png")
# Physics by College
plotmajorbycollege("Physics", "physics.png")
# Mathematics by College
plotmajorbycollege("Mathematics", "mathematics.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment