milesgrimshaw · December 12, 2015 08:59
diff --git a/gistfile1.r b/gistfile1.r
 # Read the saved facebook page into a character vector, split on newlines.
 x <- scan(
   file = "~/Dropbox/Documents/Senior Year/Spring/STAT230/YaleFacebook3.html",
   what = "",
   sep = "\n"
 )
 # Positions of the lines that mention the 'display_data' div class.
 data <- grep(pattern = "display_data", x = x, fixed = TRUE)
 # The student data is taken from the line right after each of those matches,
 # so `students` holds that HTML text (not line numbers).
 students <- x[data + 1]

 # Each student's info follows a 'student_text_container' marker, so cutting
 # the text at that marker separates the students into their own elements of `a`.
 a <- unlist(
   strsplit(students, split = "student_text_container", fixed = TRUE,
            useBytes = TRUE)
 )

 # Break the HTML apart further: first at every literal 'div', then at every
 # '<br>' tag. `c` holds the resulting fragments that the counters search.
 b <- unlist(strsplit(a, split = "div", fixed = TRUE, useBytes = TRUE))
 c <- unlist(strsplit(b, split = "<br>", fixed = TRUE, useBytes = TRUE))

 #Functions
 # Count the fragments in the global vector `c` that contain the month
 # abbreviation stored at position `index` of the global `monthslist`.
 # The match is a literal substring match (fixed = TRUE).
 # NOTE(review): any fragment containing those letters is counted, not only
 # birthdays - confirm the page has no other text with these abbreviations.
 getmonthcount <- function(index) {
   wanted <- monthslist[index]
   length(grep(pattern = wanted, x = c, fixed = TRUE))
 }
 # Count the fragments in the global vector `c` that contain the college name
 # stored at position `index` of the global `collegelist`.
 # The match is a literal substring match (fixed = TRUE).
 getcollegecount <- function(index) {
   wanted <- collegelist[index]
   length(grep(pattern = wanted, x = c, fixed = TRUE))
 }

 # Count the fragments in the global vector `c` that contain the major name
 # stored at position `index` of the global `majorslist`.
 # The match is a literal substring match (fixed = TRUE), so a short name such
 # as "Art" also counts fragments holding a longer name like "History of Art".
 # NOTE(review): confirm that this overlap between majors is intended.
 getmajorcount <- function(index) {
   wanted <- majorslist[index]
   length(grep(pattern = wanted, x = c, fixed = TRUE))
 }

 # Birthdays: how many fragments mention each month abbreviation.
 monthslist <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
 # One getmonthcount() call per month, collected into an integer vector. This
 # replaces the 0-based index loop that filled an NA column one row at a time.
 months <- data.frame(
   months = monthslist,
   count = vapply(seq_along(monthslist), getmonthcount, integer(1))
 )

 # Export the chart to birthdays.png (800x600 pixels).
 png(filename = "birthdays.png", width = 800, height = 600)
 # Widen the left margin so the y-axis label is not clipped.
 par(mar = c(5, 6, 4, 2) + 0.1)
 barplot(months$count, names.arg = months$months, xlim = NULL, cex.names = 0.9,
         ylab = "Students", xlab = "Birthday",
         main = "Number of Yale Students By Month of Birthday")
 dev.off()


 # Residential colleges: how many fragments name each college.
 collegelist <- c("Berkeley College", "Branford College", "Calhoun College",
                  "Davenport College", "Ezra Stiles College",
                  "Jonathan Edwards College", "Morse College", "Pierson College",
                  "Saybrook College", "Silliman College", "Timothy Dwight College",
                  "Trumbull College")
 # One getcollegecount() call per college, collected into an integer vector.
 # This replaces the 0-based index loop that filled an NA column row by row.
 colleges <- data.frame(
   college = collegelist,
   count = vapply(seq_along(collegelist), getcollegecount, integer(1))
 )

 # Shorter labels keep the chart axis readable: drop the " College" suffix and
 # abbreviate the three longest names. str_replace() comes from stringr.
 library("stringr")
 collegelistforprint <- str_replace(collegelist, " College", "")
 collegelistforprint <- str_replace(collegelistforprint, "Ezra Stiles", "Stiles")
 collegelistforprint <- str_replace(collegelistforprint, "Jonathan Edwards", "JE")
 collegelistforprint <- str_replace(collegelistforprint, "Timothy Dwight", "TD")

 # Plot the number of students by college into colleges.png (800x600 pixels).
 png(filename = "colleges.png", width = 800, height = 600)
 # Widen the left margin so the y-axis label is not clipped.
 par(mar = c(5, 6, 4, 2) + 0.1)
 barplot(colleges$count, names.arg = collegelistforprint, xlim = NULL,
         cex.names = 0.8, xlab = "College", ylab = "Students",
         main = "Number of Students By Residential College")
 dev.off()

 # Undergraduate majors to search for, spelled exactly as they are matched
 # against the page text (some entries are abbreviated or written without
 # spaces). The order is kept because getmajorcount() looks entries up by index.
 majorslist <- c(
   # A
   "African American Studies", "African Studies", "American Studies",
   "Anthropology", "Applied Mathematics", "Applied Physics",
   "Archaeological Studies", "Architecture", "Art", "Astronomy",
   "Astronomy & Physics",
   # B-C
   "Biology", "Chemistry", "Chinese", "Classical Civilization", "Classics",
   "Cognitive Science", "Computer Science", "Computer Science & Mathematics",
   "Computer Science & Psychology", "Computing and the Arts",
   # E
   "East Asian Studies", "Ecology & Evolutionary Biology", "Economics",
   "Economics & Mathematics", "Electrical Engineering", "Engineering",
   "English", "Environmental Studies", "Ethics,Politics & Economics",
   "Ethnicity, Race & Migration",
   # F-L
   "Film Studies", "French", "Geology & Geophysics", "German",
   "German Studies", "Global Affairs", "History", "History of Art",
   "Humanities", "Italian", "Latin American Studies", "Linguistics",
   "Literature",
   # M-N
   "Mathematics", "Mathematics & Philosophy", "Mathematics & Physics",
   "Modern Middle Eastern Studies", "Molecular Biophysics & Biochem",
   "Molecular,Cellular,Dev Biology", "Music", "Near Eastern Languages & Civs",
   # P-W
   "Philosophy", "Physics", "Physics & Philosophy", "Political Science",
   "Psychology", "Religious Studies", "Russian", "Sociology", "Spanish",
   "Statistics", "Theater Studies", "Women'sGender&SexualityStudies",
   # Kept as the final entry
   "Undeclared"
 )

 # Number of students for each major: one getmajorcount() call per entry of
 # majorslist, collected into an integer vector. This replaces the 0-based
 # index loop that filled an NA column one row at a time.
 majors <- data.frame(
   majors = majorslist,
   count = vapply(seq_along(majorslist), getmajorcount, integer(1))
 )

 # Plot students by major as horizontal bars into allmajors.png (800x1200).
 png(filename = "allmajors.png", width = 800, height = 1200)
 # las = 2 turns the axis labels perpendicular to the axes; the wide left
 # margin leaves room for the major names.
 par(las = 2, mar = c(5, 15, 4, 1) + 0.1)
 barplot(majors$count, names.arg = majorslist, space = 0.8,
         horiz = TRUE, xlim = NULL, cex.names = 1,
         xlab = "Number of Students",
         # Title typo fixed: "Numer" -> "Number".
         main = "Number Of Students With Each Declared Major")
 dev.off()

 ## Most popular majors: more than 50 matches, leaving out the "Undeclared" row.
 mostmajors <- subset(majors, count > 50 & majors != "Undeclared")

 # Chart them as horizontal bars in mostpopularmajors.png (800x600 pixels).
 png(filename = "mostpopularmajors.png", width = 800, height = 600)
 # Perpendicular axis labels plus a wide left margin for the major names.
 par(las = 2, mar = c(5, 13, 4, 2) + 0.1)
 barplot(
   mostmajors$count,
   names.arg = mostmajors$majors,
   space = 0.5,
   horiz = TRUE,
   xlim = NULL,
   cex.names = 0.9,
   xlab = "Number of Students",
   main = "The Most Popular Majors at Yale"
 )
 dev.off()

 ## Least common majors: fewer than 30 matches.
 fewmajors <- majors[which(majors$count < 30), ]

 # Chart them as horizontal bars in leastpopularmajors.png (800x900 pixels).
 png(filename = "leastpopularmajors.png", width = 800, height = 900)
 # Perpendicular axis labels plus a wide left margin for the major names.
 par(las = 2, mar = c(5, 13, 4, 2) + 0.1)
 barplot(
   fewmajors$count,
   names.arg = fewmajors$majors,
   space = 0.7,
   horiz = TRUE,
   xlim = NULL,
   cex.names = 0.9,
   xlab = "Number of Students",
   main = "The Least Popular Majors at Yale"
 )
 dev.off()

 # Goal: the number of students per (popular major, college) pair.
 # The result has the majors in the first column and one integer column per
 # college, named with the short labels from collegelistforprint.
 majorsbycollege <- data.frame(major = mostmajors$majors)

 # Positions in `a` (one element per student chunk) that mention each major.
 # Computed once up front, since they do not depend on the college.
 # An empty mostmajors simply yields an empty list, so the code below also
 # works when no major passes the popularity cut-off.
 studentsbymajor <- lapply(
   as.character(mostmajors$majors),
   grep,
   x = a,
   fixed = TRUE
 )

 for (i in seq_along(collegelist)) {
   # Positions in `a` that mention the current college.
   studentswithcollege <- grep(collegelist[i], a, fixed = TRUE)
   # For every major, count the positions that also mention this college.
   majorsbycollege[[collegelistforprint[i]]] <- vapply(
     studentsbymajor,
     function(studentswithmajor) sum(studentswithmajor %in% studentswithcollege),
     integer(1)
   )
 }

 # Draw one "<major> by college" bar chart from the by-college count table.
 #
 # major: exact name of the major, as stored in the first column of `bycollege`.
 # file:  name of the PNG file to write (800x600 pixels).
 # title: main title of the chart.
 # bycollege, labels: the count table and the bar labels; they default to the
 #   script-level majorsbycollege and collegelistforprint objects.
 #
 # Returns (invisibly) the row of `bycollege` that was plotted, or integer(0)
 # when the major is not in the table, in which case a warning is raised and
 # no file is written.
 plotmajorbycollege <- function(major, file, title,
                                bycollege = majorsbycollege,
                                labels = collegelistforprint) {
   # Look the row up by exact name. A substring search for "Economics" or
   # "Physics" can also hit combined majors such as "Economics & Mathematics",
   # which selects several rows and breaks the conversion to a plain vector.
   row <- which(as.character(bycollege[, 1]) == major)
   if (length(row) != 1) {
     warning("No single row for major '", major, "'; skipping ", file,
             call. = FALSE)
     return(invisible(integer(0)))
   }
   # The counts sit in the columns after the first, one per college label.
   counts <- unlist(bycollege[row, seq_along(labels) + 1], use.names = FALSE)

   png(filename = file, width = 800, height = 600)
   # Close the device even if the plotting call fails.
   on.exit(dev.off(), add = TRUE)
   par(mar = c(5, 6, 4, 2) + 0.1)
   barplot(as.integer(counts), names.arg = labels, cex.names = 0.8,
           xlab = "College", main = title)
   invisible(row)
 }

 # Engineering by College
 engineering <- plotmajorbycollege(
   "Engineering", "engineering.png",
   "Number of Engineering Majors by College"
 )

 # Computer Science by College
 computer <- plotmajorbycollege(
   "Computer Science", "compsci.png",
   "Number of Computer Science Majors by College"
 )

 # Economics by College (the exact-name lookup keeps the
 # "Ethics,Politics & Economics" row out of this chart)
 economics <- plotmajorbycollege(
   "Economics", "econ.png",
   "Number of Economics Majors by College"
 )

 # English by College
 english <- plotmajorbycollege(
   "English", "english.png",
   "Number of English Majors by College"
 )

 # Physics by College
 physics <- plotmajorbycollege(
   "Physics", "physics.png",
   "Number of Physics Majors by College"
 )

 # Mathematics by College
 mathematics <- plotmajorbycollege(
   "Mathematics", "mathematics.png",
   "Number of Mathematics Majors by College"
 )
	# Read the saved facebook page into a character vector, split on newlines.
	x <- scan(
	  file = "~/Dropbox/Documents/Senior Year/Spring/STAT230/YaleFacebook3.html",
	  what = "",
	  sep = "\n"
	)
	# Positions of the lines that mention the 'display_data' div class.
	data <- grep(pattern = "display_data", x = x, fixed = TRUE)
	# The student data is taken from the line right after each of those matches,
	# so `students` holds that HTML text (not line numbers).
	students <- x[data + 1]

	# Each student's info follows a 'student_text_container' marker, so cutting
	# the text at that marker separates the students into their own elements of `a`.
	a <- unlist(
	  strsplit(students, split = "student_text_container", fixed = TRUE,
	           useBytes = TRUE)
	)

	# Break the HTML apart further: first at every literal 'div', then at every
	# '<br>' tag. `c` holds the resulting fragments that the counters search.
	b <- unlist(strsplit(a, split = "div", fixed = TRUE, useBytes = TRUE))
	c <- unlist(strsplit(b, split = "<br>", fixed = TRUE, useBytes = TRUE))

	#Functions
	# Count the fragments in the global vector `c` that contain the month
	# abbreviation stored at position `index` of the global `monthslist`.
	# The match is a literal substring match (fixed = TRUE).
	# NOTE(review): any fragment containing those letters is counted, not only
	# birthdays - confirm the page has no other text with these abbreviations.
	getmonthcount <- function(index) {
	  wanted <- monthslist[index]
	  length(grep(pattern = wanted, x = c, fixed = TRUE))
	}
	# Count the fragments in the global vector `c` that contain the college name
	# stored at position `index` of the global `collegelist`.
	# The match is a literal substring match (fixed = TRUE).
	getcollegecount <- function(index) {
	  wanted <- collegelist[index]
	  length(grep(pattern = wanted, x = c, fixed = TRUE))
	}

	# Count the fragments in the global vector `c` that contain the major name
	# stored at position `index` of the global `majorslist`.
	# The match is a literal substring match (fixed = TRUE), so a short name such
	# as "Art" also counts fragments holding a longer name like "History of Art".
	# NOTE(review): confirm that this overlap between majors is intended.
	getmajorcount <- function(index) {
	  wanted <- majorslist[index]
	  length(grep(pattern = wanted, x = c, fixed = TRUE))
	}

	# Birthdays: how many fragments mention each month abbreviation.
	monthslist <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
	                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
	# One getmonthcount() call per month, collected into an integer vector. This
	# replaces the 0-based index loop that filled an NA column one row at a time.
	months <- data.frame(
	  months = monthslist,
	  count = vapply(seq_along(monthslist), getmonthcount, integer(1))
	)

	# Export the chart to birthdays.png (800x600 pixels).
	png(filename = "birthdays.png", width = 800, height = 600)
	# Widen the left margin so the y-axis label is not clipped.
	par(mar = c(5, 6, 4, 2) + 0.1)
	barplot(months$count, names.arg = months$months, xlim = NULL, cex.names = 0.9,
	        ylab = "Students", xlab = "Birthday",
	        main = "Number of Yale Students By Month of Birthday")
	dev.off()


	# Residential colleges: how many fragments name each college.
	collegelist <- c("Berkeley College", "Branford College", "Calhoun College",
	                 "Davenport College", "Ezra Stiles College",
	                 "Jonathan Edwards College", "Morse College", "Pierson College",
	                 "Saybrook College", "Silliman College", "Timothy Dwight College",
	                 "Trumbull College")
	# One getcollegecount() call per college, collected into an integer vector.
	# This replaces the 0-based index loop that filled an NA column row by row.
	colleges <- data.frame(
	  college = collegelist,
	  count = vapply(seq_along(collegelist), getcollegecount, integer(1))
	)

	# Shorter labels keep the chart axis readable: drop the " College" suffix and
	# abbreviate the three longest names. str_replace() comes from stringr.
	library("stringr")
	collegelistforprint <- str_replace(collegelist, " College", "")
	collegelistforprint <- str_replace(collegelistforprint, "Ezra Stiles", "Stiles")
	collegelistforprint <- str_replace(collegelistforprint, "Jonathan Edwards", "JE")
	collegelistforprint <- str_replace(collegelistforprint, "Timothy Dwight", "TD")

	# Plot the number of students by college into colleges.png (800x600 pixels).
	png(filename = "colleges.png", width = 800, height = 600)
	# Widen the left margin so the y-axis label is not clipped.
	par(mar = c(5, 6, 4, 2) + 0.1)
	barplot(colleges$count, names.arg = collegelistforprint, xlim = NULL,
	        cex.names = 0.8, xlab = "College", ylab = "Students",
	        main = "Number of Students By Residential College")
	dev.off()

	# Undergraduate majors to search for, spelled exactly as they are matched
	# against the page text (some entries are abbreviated or written without
	# spaces). The order is kept because getmajorcount() looks entries up by index.
	majorslist <- c(
	  # A
	  "African American Studies", "African Studies", "American Studies",
	  "Anthropology", "Applied Mathematics", "Applied Physics",
	  "Archaeological Studies", "Architecture", "Art", "Astronomy",
	  "Astronomy & Physics",
	  # B-C
	  "Biology", "Chemistry", "Chinese", "Classical Civilization", "Classics",
	  "Cognitive Science", "Computer Science", "Computer Science & Mathematics",
	  "Computer Science & Psychology", "Computing and the Arts",
	  # E
	  "East Asian Studies", "Ecology & Evolutionary Biology", "Economics",
	  "Economics & Mathematics", "Electrical Engineering", "Engineering",
	  "English", "Environmental Studies", "Ethics,Politics & Economics",
	  "Ethnicity, Race & Migration",
	  # F-L
	  "Film Studies", "French", "Geology & Geophysics", "German",
	  "German Studies", "Global Affairs", "History", "History of Art",
	  "Humanities", "Italian", "Latin American Studies", "Linguistics",
	  "Literature",
	  # M-N
	  "Mathematics", "Mathematics & Philosophy", "Mathematics & Physics",
	  "Modern Middle Eastern Studies", "Molecular Biophysics & Biochem",
	  "Molecular,Cellular,Dev Biology", "Music", "Near Eastern Languages & Civs",
	  # P-W
	  "Philosophy", "Physics", "Physics & Philosophy", "Political Science",
	  "Psychology", "Religious Studies", "Russian", "Sociology", "Spanish",
	  "Statistics", "Theater Studies", "Women'sGender&SexualityStudies",
	  # Kept as the final entry
	  "Undeclared"
	)

	# Number of students for each major: one getmajorcount() call per entry of
	# majorslist, collected into an integer vector. This replaces the 0-based
	# index loop that filled an NA column one row at a time.
	majors <- data.frame(
	  majors = majorslist,
	  count = vapply(seq_along(majorslist), getmajorcount, integer(1))
	)

	# Plot students by major as horizontal bars into allmajors.png (800x1200).
	png(filename = "allmajors.png", width = 800, height = 1200)
	# las = 2 turns the axis labels perpendicular to the axes; the wide left
	# margin leaves room for the major names.
	par(las = 2, mar = c(5, 15, 4, 1) + 0.1)
	barplot(majors$count, names.arg = majorslist, space = 0.8,
	        horiz = TRUE, xlim = NULL, cex.names = 1,
	        xlab = "Number of Students",
	        # Title typo fixed: "Numer" -> "Number".
	        main = "Number Of Students With Each Declared Major")
	dev.off()

	## Most popular majors: more than 50 matches, leaving out the "Undeclared" row.
	mostmajors <- subset(majors, count > 50 & majors != "Undeclared")

	# Chart them as horizontal bars in mostpopularmajors.png (800x600 pixels).
	png(filename = "mostpopularmajors.png", width = 800, height = 600)
	# Perpendicular axis labels plus a wide left margin for the major names.
	par(las = 2, mar = c(5, 13, 4, 2) + 0.1)
	barplot(
	  mostmajors$count,
	  names.arg = mostmajors$majors,
	  space = 0.5,
	  horiz = TRUE,
	  xlim = NULL,
	  cex.names = 0.9,
	  xlab = "Number of Students",
	  main = "The Most Popular Majors at Yale"
	)
	dev.off()

	## Least common majors: fewer than 30 matches.
	fewmajors <- majors[which(majors$count < 30), ]

	# Chart them as horizontal bars in leastpopularmajors.png (800x900 pixels).
	png(filename = "leastpopularmajors.png", width = 800, height = 900)
	# Perpendicular axis labels plus a wide left margin for the major names.
	par(las = 2, mar = c(5, 13, 4, 2) + 0.1)
	barplot(
	  fewmajors$count,
	  names.arg = fewmajors$majors,
	  space = 0.7,
	  horiz = TRUE,
	  xlim = NULL,
	  cex.names = 0.9,
	  xlab = "Number of Students",
	  main = "The Least Popular Majors at Yale"
	)
	dev.off()

	# Goal: the number of students per (popular major, college) pair.
	# The result has the majors in the first column and one integer column per
	# college, named with the short labels from collegelistforprint.
	majorsbycollege <- data.frame(major = mostmajors$majors)

	# Positions in `a` (one element per student chunk) that mention each major.
	# Computed once up front, since they do not depend on the college.
	# An empty mostmajors simply yields an empty list, so the code below also
	# works when no major passes the popularity cut-off.
	studentsbymajor <- lapply(
	  as.character(mostmajors$majors),
	  grep,
	  x = a,
	  fixed = TRUE
	)

	for (i in seq_along(collegelist)) {
	  # Positions in `a` that mention the current college.
	  studentswithcollege <- grep(collegelist[i], a, fixed = TRUE)
	  # For every major, count the positions that also mention this college.
	  majorsbycollege[[collegelistforprint[i]]] <- vapply(
	    studentsbymajor,
	    function(studentswithmajor) sum(studentswithmajor %in% studentswithcollege),
	    integer(1)
	  )
	}

	# Draw one "<major> by college" bar chart from the by-college count table.
	#
	# major: exact name of the major, as stored in the first column of `bycollege`.
	# file:  name of the PNG file to write (800x600 pixels).
	# title: main title of the chart.
	# bycollege, labels: the count table and the bar labels; they default to the
	#   script-level majorsbycollege and collegelistforprint objects.
	#
	# Returns (invisibly) the row of `bycollege` that was plotted, or integer(0)
	# when the major is not in the table, in which case a warning is raised and
	# no file is written.
	plotmajorbycollege <- function(major, file, title,
	                               bycollege = majorsbycollege,
	                               labels = collegelistforprint) {
	  # Look the row up by exact name. A substring search for "Economics" or
	  # "Physics" can also hit combined majors such as "Economics & Mathematics",
	  # which selects several rows and breaks the conversion to a plain vector.
	  row <- which(as.character(bycollege[, 1]) == major)
	  if (length(row) != 1) {
	    warning("No single row for major '", major, "'; skipping ", file,
	            call. = FALSE)
	    return(invisible(integer(0)))
	  }
	  # The counts sit in the columns after the first, one per college label.
	  counts <- unlist(bycollege[row, seq_along(labels) + 1], use.names = FALSE)

	  png(filename = file, width = 800, height = 600)
	  # Close the device even if the plotting call fails.
	  on.exit(dev.off(), add = TRUE)
	  par(mar = c(5, 6, 4, 2) + 0.1)
	  barplot(as.integer(counts), names.arg = labels, cex.names = 0.8,
	          xlab = "College", main = title)
	  invisible(row)
	}

	# Engineering by College
	engineering <- plotmajorbycollege(
	  "Engineering", "engineering.png",
	  "Number of Engineering Majors by College"
	)

	# Computer Science by College
	computer <- plotmajorbycollege(
	  "Computer Science", "compsci.png",
	  "Number of Computer Science Majors by College"
	)

	# Economics by College (the exact-name lookup keeps the
	# "Ethics,Politics & Economics" row out of this chart)
	economics <- plotmajorbycollege(
	  "Economics", "econ.png",
	  "Number of Economics Majors by College"
	)

	# English by College
	english <- plotmajorbycollege(
	  "English", "english.png",
	  "Number of English Majors by College"
	)

	# Physics by College
	physics <- plotmajorbycollege(
	  "Physics", "physics.png",
	  "Number of Physics Majors by College"
	)

	# Mathematics by College
	mathematics <- plotmajorbycollege(
	  "Mathematics", "mathematics.png",
	  "Number of Mathematics Majors by College"
	)