ccagrawal · December 15, 2015 12:52
diff --git a/lsn_scraper.R b/lsn_scraper.R
 # Set the global stringsAsFactors option to FALSE so that data-frame
 # constructors which consult it (e.g. read.csv further down, on R versions
 # before 4.0) keep text columns as character vectors instead of factors.
 options(stringsAsFactors = FALSE)

 # Scrape the applicant data points for one school and one admissions cycle.
 #
 # The stats page URL is built by substituting `school` and `cycle` into the
 # lawschoolnumbers.com template. The page is read line by line and cut down
 # to the span between the first 'pointWidth:' line and the chart container
 # <div>. Within that span, line 2 is parsed as the accepted series and
 # line 14 as the rejected series.
 #
 # Args:
 #   school: Subdomain of the school (substituted for SCHOOL in the URL).
 #   cycle:  Cycle code (substituted for CYCLE in the URL), e.g. '1415'.
 #
 # Returns:
 #   A data frame with numeric columns LSAT (the point's x value), GPA (the
 #   point's y value), URM (1 when the point name ends in "(URM)", else 0) and
 #   Outcome (1 for the accepted series, 0 for the rejected series). A series
 #   without points contributes zero rows.
 #
 # Raises:
 #   An error when either page marker cannot be found.
 ScrapeLSN <- function(school, cycle) {

   # Parse one series line into a data frame of points tagged with `outcome`.
   # Building the columns directly from the parsed vectors keeps this valid
   # for an empty series; assigning a scalar to a column of a zero-row data
   # frame would raise "replacement has 1 row, data has 0".
   ParseSeries <- function(series, outcome) {
     points <- strsplit(series, '\\{')[[1]]
     points <- points[grepl('name:', points)]

     data.frame(
       # Greedy '.*' means the last 'x: ' / 'y: ' in the entry is used.
       LSAT = as.numeric(gsub('.*x: ([0-9]*).*', '\\1', points)),
       GPA = as.numeric(gsub('.*y: ([0-9\\.]*).*', '\\1', points)),
       URM = as.numeric(grepl("\\(URM)', x: ", points)),
       Outcome = rep(outcome, length(points))
     )
   }

   base.url <- 'http://SCHOOL.lawschoolnumbers.com/stats/CYCLE'
   url <- gsub('SCHOOL', school, base.url)
   url <- gsub('CYCLE', cycle, url)

   src <- readLines(url)

   first <- grep('pointWidth:', src)
   last <- grep('<div id="container" style="width: 630px; height: 525px;"></div>', src)
   if (length(first) == 0 || length(last) == 0) {
     stop('Could not locate the chart data in ', url, call. = FALSE)
   }

   # Use the first occurrence of each marker, as `:` would with a warning.
   src <- src[first[1]:last[1]]

   rbind(ParseSeries(src[2], 1), ParseSeries(src[14], 0))
 }

 # Per-school analysis: fit a logistic regression of Outcome on LSAT, GPA and
 # URM using the training cycles, score it on the test cycles, and write the
 # coefficients plus accuracy for every school to 'analysis.csv'.
 schools <- read.csv('./schools.csv', header = FALSE)[, 1]
 train.cycles <- c('1415')
 test.cycles <- c('1314')

 # One row per school: fitted coefficients, number of scored test rows and
 # the share of those rows predicted correctly.
 df <- data.frame(matrix(nrow = length(schools), ncol = 7, data = 0))
 colnames(df) <- c('school', 'Intercept', 'LSAT', 'GPA', 'URM', 'Sample', 'Accuracy')
 df$school <- schools

 # seq_len() yields an empty sequence for zero rows, whereas 1:0 would iterate.
 for (i in seq_len(nrow(df))) {

   school <- df[i, 'school']

   # Scrape every cycle first and bind once instead of growing a data frame.
   train.data <- do.call(rbind, lapply(train.cycles, function(cycle) {
     ScrapeLSN(school, cycle)
   }))

   fit <- glm(Outcome ~ LSAT + GPA + URM, data = train.data, family = 'binomial')
   # coef() avoids relying on partial matching of fit$coef to $coefficients.
   df[i, c('Intercept', 'LSAT', 'GPA', 'URM')] <- coef(fit)

   test.data <- do.call(rbind, lapply(test.cycles, function(cycle) {
     ScrapeLSN(school, cycle)
   }))

   test.data$pred <- predict(fit, newdata = test.data[, c('LSAT', 'GPA', 'URM')], type = 'response')
   test.data$pred.int <- round(test.data$pred)

   # Outcome and pred.int are both 0/1, so a hit is simply equality; rows with
   # a missing prediction are NA here and excluded from both counts.
   hits <- test.data$Outcome == test.data$pred.int
   correct <- sum(hits, na.rm = TRUE)
   incorrect <- sum(!hits, na.rm = TRUE)
   df[i, 'Sample'] <- (correct + incorrect)
   df[i, 'Accuracy'] <- correct / (correct + incorrect)

   cat(i, '/', nrow(df), '\n')
 }

 # Sample-weighted overall accuracy. A school with no scored rows has a NaN
 # accuracy; na.rm drops its (zero-weight) term instead of poisoning the sum.
 accuracy <- sum(df$Sample * df$Accuracy, na.rm = TRUE) / sum(df$Sample)

 write.csv(df, 'analysis.csv')
	# Set the global stringsAsFactors option to FALSE so that data-frame
	# constructors which consult it (e.g. read.csv further down, on R versions
	# before 4.0) keep text columns as character vectors instead of factors.
	options(stringsAsFactors = FALSE)

	# Scrape the applicant data points for one school and one admissions cycle.
	#
	# The stats page URL is built by substituting `school` and `cycle` into the
	# lawschoolnumbers.com template. The page is read line by line and cut down
	# to the span between the first 'pointWidth:' line and the chart container
	# <div>. Within that span, line 2 is parsed as the accepted series and
	# line 14 as the rejected series.
	#
	# Args:
	#   school: Subdomain of the school (substituted for SCHOOL in the URL).
	#   cycle:  Cycle code (substituted for CYCLE in the URL), e.g. '1415'.
	#
	# Returns:
	#   A data frame with numeric columns LSAT (the point's x value), GPA (the
	#   point's y value), URM (1 when the point name ends in "(URM)", else 0)
	#   and Outcome (1 for the accepted series, 0 for the rejected series). A
	#   series without points contributes zero rows.
	#
	# Raises:
	#   An error when either page marker cannot be found.
	ScrapeLSN <- function(school, cycle) {

	  # Parse one series line into a data frame of points tagged with `outcome`.
	  # Columns are built directly from the parsed vectors so an empty series
	  # gives a valid zero-row frame; assigning a scalar to a column of a
	  # zero-row data frame would raise "replacement has 1 row, data has 0".
	  ParseSeries <- function(series, outcome) {
	    points <- strsplit(series, '\\{')[[1]]
	    points <- points[grepl('name:', points)]

	    data.frame(
	      # The leading '.*' and the '*' inside the capture group are required:
	      # without them only one digit is captured and the text before the
	      # match is left in place, so as.numeric() yields NA for every point.
	      LSAT = as.numeric(gsub('.*x: ([0-9]*).*', '\\1', points)),
	      GPA = as.numeric(gsub('.*y: ([0-9\\.]*).*', '\\1', points)),
	      URM = as.numeric(grepl("\\(URM)', x: ", points)),
	      Outcome = rep(outcome, length(points))
	    )
	  }

	  base.url <- 'http://SCHOOL.lawschoolnumbers.com/stats/CYCLE'
	  url <- gsub('SCHOOL', school, base.url)
	  url <- gsub('CYCLE', cycle, url)

	  src <- readLines(url)

	  first <- grep('pointWidth:', src)
	  last <- grep('<div id="container" style="width: 630px; height: 525px;"></div>', src)
	  if (length(first) == 0 || length(last) == 0) {
	    stop('Could not locate the chart data in ', url, call. = FALSE)
	  }

	  # Use the first occurrence of each marker, as `:` would with a warning.
	  src <- src[first[1]:last[1]]

	  rbind(ParseSeries(src[2], 1), ParseSeries(src[14], 0))
	}

	# Per-school analysis: fit a logistic regression of Outcome on LSAT, GPA and
	# URM using the training cycles, score it on the test cycles, and write the
	# coefficients plus accuracy for every school to 'analysis.csv'.
	schools <- read.csv('./schools.csv', header = FALSE)[, 1]
	train.cycles <- c('1415')
	test.cycles <- c('1314')

	# One row per school: fitted coefficients, number of scored test rows and
	# the share of those rows predicted correctly.
	df <- data.frame(matrix(nrow = length(schools), ncol = 7, data = 0))
	colnames(df) <- c('school', 'Intercept', 'LSAT', 'GPA', 'URM', 'Sample', 'Accuracy')
	df$school <- schools

	# seq_len() yields an empty sequence for zero rows, whereas 1:0 would iterate.
	for (i in seq_len(nrow(df))) {

	  school <- df[i, 'school']

	  # Scrape every cycle first and bind once instead of growing a data frame.
	  train.data <- do.call(rbind, lapply(train.cycles, function(cycle) {
	    ScrapeLSN(school, cycle)
	  }))

	  fit <- glm(Outcome ~ LSAT + GPA + URM, data = train.data, family = 'binomial')
	  # coef() avoids relying on partial matching of fit$coef to $coefficients.
	  df[i, c('Intercept', 'LSAT', 'GPA', 'URM')] <- coef(fit)

	  test.data <- do.call(rbind, lapply(test.cycles, function(cycle) {
	    ScrapeLSN(school, cycle)
	  }))

	  test.data$pred <- predict(fit, newdata = test.data[, c('LSAT', 'GPA', 'URM')], type = 'response')
	  test.data$pred.int <- round(test.data$pred)

	  # Outcome and pred.int are both 0/1, so a hit is simply equality; rows
	  # with a missing prediction are NA here and excluded from both counts.
	  hits <- test.data$Outcome == test.data$pred.int
	  correct <- sum(hits, na.rm = TRUE)
	  incorrect <- sum(!hits, na.rm = TRUE)
	  df[i, 'Sample'] <- (correct + incorrect)
	  df[i, 'Accuracy'] <- correct / (correct + incorrect)

	  cat(i, '/', nrow(df), '\n')
	}

	# Sample-weighted overall accuracy. A school with no scored rows has a NaN
	# accuracy; na.rm drops its (zero-weight) term instead of poisoning the sum.
	accuracy <- sum(df$Sample * df$Accuracy, na.rm = TRUE) / sum(df$Sample)

	write.csv(df, 'analysis.csv')
No results found