drivendata · May 28, 2015 15:18
diff --git a/functions_to_keep_it_fresh.R b/functions_to_keep_it_fresh.R
 # INSTALL JSON PACKAGE IF NEEDED
 #install.packages("jsonlite")
 library("jsonlite")

 # ===== FUNCTIONS FOR ID MATCHING ============
 # Build the inverse of the restaurant-id -> Yelp-id mapping stored in a CSV.
 #
 # The CSV is expected to hold the restaurant id in its first column and one
 # Yelp id per remaining column; empty cells are read as NA.
 #
 # Args:
 #   pathToRestaurantIdsCSV: path of the id-mapping CSV file.
 #
 # Returns:
 #   A one-column data frame whose row names are the Yelp ids and whose single
 #   column is the matching restaurant id. If the file has no Yelp-id columns
 #   the result is an empty (zero-row) data frame.
 getReverseIds <- function(pathToRestaurantIdsCSV){
   # load the id map from the csv, treating empty cells as missing
   bosToYelp <- read.csv(pathToRestaurantIdsCSV, na.strings="")

   # seq(2, ncol) would count backwards (2, 1) for a one-column file, so
   # return an empty mapping instead of indexing a column that is not there
   if(ncol(bosToYelp) < 2){
     return(bosToYelp[0, 1, drop=FALSE])
   }

   # one reversed (yelp id -> restaurant id) table per Yelp-id column
   reversedColumns <- lapply(seq(2, ncol(bosToYelp)), function(j){
     # pair the restaurant ids with this Yelp-id column
     reverse <- bosToYelp[, c(1, j)]

     # drop rows that have no Yelp id in this column
     reverse <- reverse[complete.cases(reverse), ]

     # index by the Yelp id
     # NOTE(review): a Yelp id repeated within one column makes this
     # assignment fail (duplicate row names) - confirm the mapping is unique
     row.names(reverse) <- reverse[, 2]

     # keep only the restaurant id, still as a data frame
     reverse[, 1, drop=FALSE]
   })

   # stack the reversed columns
   do.call("rbind", reversedColumns)
 }

 # Swap the Yelp business ids of a data frame for the restaurant (Boston) ids.
 #
 # Args:
 #   yelpDataFrame: data frame with a `business_id` column of Yelp ids.
 #   pathToRestaurantIdsCSV: path of the id-mapping CSV handed to getReverseIds.
 #
 # Returns:
 #   yelpDataFrame with `business_id` replaced; ids that are absent from the
 #   mapping come back as NA because match() finds no row for them.
 replaceYelpWithBostonIds <- function(yelpDataFrame, pathToRestaurantIdsCSV){
   # lookup table: row names are Yelp ids, the single column is the Boston id
   idLookup <- getReverseIds(pathToRestaurantIdsCSV)

   # position of every business in the lookup table (NA when unknown)
   lookupRows <- match(yelpDataFrame$business_id, row.names(idLookup))

   # overwrite the Yelp ids with the ids found in the lookup table
   yelpDataFrame$business_id <- idLookup[lookupRows, ]

   yelpDataFrame
 }

 # ===== FUNCTIONS FOR LOADING YELP DATA ============
 # Load a Yelp dataset file that stores one JSON object per line.
 #
 # Args:
 #   pathToYelpJson: path of the line-delimited JSON file.
 #   pathToRestaurantIdsCSV: path of the id-mapping CSV; only used when the
 #     parsed data has a `business_id` column.
 #
 # Returns:
 #   The value of fromJSON(..., flatten=TRUE) for the file's records, with
 #   `business_id` translated by replaceYelpWithBostonIds when that column
 #   is present.
 loadYelpData <- function(pathToYelpJson, pathToRestaurantIdsCSV){
   jsonLines <- readLines(pathToYelpJson)

   # skip blank lines: joining them would leave ",," or a trailing ","
   # inside the array, which is not valid JSON
   jsonLines <- jsonLines[grepl("[^[:space:]]", jsonLines)]

   # join the records with "," and wrap them in "[" "]" so the parser
   # sees a single JSON array
   jsonData <- paste0("[", paste(jsonLines, collapse=","), "]")

   # parse the data into a 2d data frame
   yelpDf <- fromJSON(jsonData, flatten=TRUE)

   # replace yelp ids with boston ids if this file has business ids
   if("business_id" %in% colnames(yelpDf)){
     yelpDf <- replaceYelpWithBostonIds(yelpDf, pathToRestaurantIdsCSV)
   }

   yelpDf
 }

 # ===== FUNCTIONS FOR INSPECTION DATA ===========
 # Read an inspections CSV and give its columns fixed, code-friendly names.
 #
 # Args:
 #   pathToInspectionsCSV: path of a CSV with a header row; its first column
 #     becomes the row names and the remaining five columns are renamed.
 #
 # Returns:
 #   A data frame with columns date, restaurant_id, one_star, two_stars and
 #   three_stars.
 loadInspections <- function(pathToInspectionsCSV){
   # names applied to the columns that remain after the row-name column
   inspectionColumns <- c("date", "restaurant_id", "one_star", "two_stars", "three_stars")

   inspectionTable <- read.csv(pathToInspectionsCSV, header=TRUE, row.names=1)
   names(inspectionTable) <- inspectionColumns

   inspectionTable
 }

 # Write predictions to a CSV that uses the submission format's column names.
 #
 # Args:
 #   predictionDataFrame: predictions, with as many columns as the submission
 #     format has after its first (row-name) column.
 #   pathToSubmissionFormat: path of the submission-format CSV whose header
 #     supplies the column names.
 #   submissionFileName: path of the CSV file to write.
 writeSubmission <- function(predictionDataFrame, pathToSubmissionFormat, submissionFileName="new_submission.csv"){
   # take the header exactly as written in the format file (no name mangling)
   officialHeader <- colnames(read.csv(pathToSubmissionFormat, row.names=1, check.names=FALSE))

   # relabel the predictions with the official column names
   colnames(predictionDataFrame) <- officialHeader

   # write the relabelled predictions to disk
   write.csv(predictionDataFrame, file=submissionFileName)
 }

 # ====================================
 #      LOAD THE DATA FROM DISK
 # ====================================

 # Read every Yelp dataset file; each call also gets the id-mapping CSV
 businesses <- loadYelpData(
   pathToYelpJson="data/yelp_academic_dataset_business.json",
   pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
 )
 reviews <- loadYelpData(
   pathToYelpJson="data/yelp_academic_dataset_review.json",
   pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
 )
 checkins <- loadYelpData(
   pathToYelpJson="data/yelp_academic_dataset_checkin.json",
   pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
 )
 users <- loadYelpData(
   pathToYelpJson="data/yelp_academic_dataset_user.json",
   pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
 )
 tips <- loadYelpData(
   pathToYelpJson="data/yelp_academic_dataset_tip.json",
   pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
 )

 # Training labels and the submission template share the same column layout
 train <- loadInspections(pathToInspectionsCSV="data/train_labels.csv")
 test <- loadInspections(pathToInspectionsCSV="data/SubmissionFormat.csv")

 # ====================================
 #      Make a simple test model
 # ====================================
 # Fit one linear model per violation level on each restaurant's Yelp star
 # rating and review count, then write the predictions as a submission file.
 #
 # Args (all optional; the defaults reproduce the original script, which read
 # the globals `businesses`, `train`, `test` and used fixed file names):
 #   businessData: data frame with business_id, stars and review_count.
 #   trainLabels: inspections with restaurant_id, one_star, two_stars and
 #     three_stars columns.
 #   testLabels: inspections to predict for, laid out like trainLabels.
 #   pathToSubmissionFormat: submission-format CSV passed to writeSubmission.
 #   submissionFileName: name of the CSV file to write.
 #
 # Returns:
 #   Whatever writeSubmission returns; called for its side effect of writing
 #   the submission file.
 makeSimplePredictions <- function(businessData=businesses,
                                   trainLabels=train,
                                   testLabels=test,
                                   pathToSubmissionFormat="data/SubmissionFormat.csv",
                                   submissionFileName="stars_review_count.csv"){
   featureColumns <- c("stars", "review_count")

   # look up the features of every inspected restaurant; restaurants that are
   # missing from businessData get NA features (and therefore NA predictions)
   X_train <- businessData[match(trainLabels$restaurant_id, businessData$business_id), featureColumns]
   X_test <- businessData[match(testLabels$restaurant_id, businessData$business_id), featureColumns]

   # the model frames do not change between violation levels: build them once
   trainFrame <- cbind(trainLabels, X_train)
   testFrame <- cbind(testLabels, X_test)

   finalPredictions <- testLabels

   for(starLevel in c("one_star", "two_stars", "three_stars")){
     # formula for this level of violations
     levelFormula <- as.formula(paste0(starLevel, " ~ stars + review_count"))

     # fit a simple linear model
     model <- lm(formula=levelFormula, data=trainFrame)

     # predict the violations through the generic so dispatch picks the method
     predictions <- predict(model, newdata=testFrame)

     # force them to be integers (counts); as.integer truncates toward zero
     predictions <- as.integer(predictions)

     # clamp negative counts to 0 and store the predictions
     finalPredictions[starLevel] <- pmax(predictions, 0)
   }

   writeSubmission(finalPredictions, pathToSubmissionFormat, submissionFileName)
 }
 # Run the baseline: fit the models and write "stars_review_count.csv"
 makeSimplePredictions()
	# INSTALL JSON PACKAGE IF NEEDED
	#install.packages("jsonlite")
	library("jsonlite")

	# ===== FUNCTIONS FOR ID MATCHING ============
	# Build the inverse of the restaurant-id -> Yelp-id mapping stored in a CSV.
	#
	# The CSV is expected to hold the restaurant id in its first column and one
	# Yelp id per remaining column; empty cells are read as NA.
	#
	# Args:
	#   pathToRestaurantIdsCSV: path of the id-mapping CSV file.
	#
	# Returns:
	#   A one-column data frame whose row names are the Yelp ids and whose single
	#   column is the matching restaurant id. If the file has no Yelp-id columns
	#   the result is an empty (zero-row) data frame.
	getReverseIds <- function(pathToRestaurantIdsCSV){
	  # load the id map from the csv, treating empty cells as missing
	  bosToYelp <- read.csv(pathToRestaurantIdsCSV, na.strings="")

	  # seq(2, ncol) would count backwards (2, 1) for a one-column file, so
	  # return an empty mapping instead of indexing a column that is not there
	  if(ncol(bosToYelp) < 2){
	    return(bosToYelp[0, 1, drop=FALSE])
	  }

	  # one reversed (yelp id -> restaurant id) table per Yelp-id column
	  reversedColumns <- lapply(seq(2, ncol(bosToYelp)), function(j){
	    # pair the restaurant ids with this Yelp-id column
	    reverse <- bosToYelp[, c(1, j)]

	    # drop rows that have no Yelp id in this column
	    reverse <- reverse[complete.cases(reverse), ]

	    # index by the Yelp id
	    # NOTE(review): a Yelp id repeated within one column makes this
	    # assignment fail (duplicate row names) - confirm the mapping is unique
	    row.names(reverse) <- reverse[, 2]

	    # keep only the restaurant id, still as a data frame
	    reverse[, 1, drop=FALSE]
	  })

	  # stack the reversed columns
	  do.call("rbind", reversedColumns)
	}

	# Swap the Yelp business ids of a data frame for the restaurant (Boston) ids.
	#
	# Args:
	#   yelpDataFrame: data frame with a `business_id` column of Yelp ids.
	#   pathToRestaurantIdsCSV: path of the id-mapping CSV handed to getReverseIds.
	#
	# Returns:
	#   yelpDataFrame with `business_id` replaced; ids that are absent from the
	#   mapping come back as NA because match() finds no row for them.
	replaceYelpWithBostonIds <- function(yelpDataFrame, pathToRestaurantIdsCSV){
	  # lookup table: row names are Yelp ids, the single column is the Boston id
	  idLookup <- getReverseIds(pathToRestaurantIdsCSV)

	  # position of every business in the lookup table (NA when unknown)
	  lookupRows <- match(yelpDataFrame$business_id, row.names(idLookup))

	  # overwrite the Yelp ids with the ids found in the lookup table
	  yelpDataFrame$business_id <- idLookup[lookupRows, ]

	  yelpDataFrame
	}

	# ===== FUNCTIONS FOR LOADING YELP DATA ============
	# Load a Yelp dataset file that stores one JSON object per line.
	#
	# Args:
	#   pathToYelpJson: path of the line-delimited JSON file.
	#   pathToRestaurantIdsCSV: path of the id-mapping CSV; only used when the
	#     parsed data has a `business_id` column.
	#
	# Returns:
	#   The value of fromJSON(..., flatten=TRUE) for the file's records, with
	#   `business_id` translated by replaceYelpWithBostonIds when that column
	#   is present.
	loadYelpData <- function(pathToYelpJson, pathToRestaurantIdsCSV){
	  jsonLines <- readLines(pathToYelpJson)

	  # skip blank lines: joining them would leave ",," or a trailing ","
	  # inside the array, which is not valid JSON
	  jsonLines <- jsonLines[grepl("[^[:space:]]", jsonLines)]

	  # join the records with "," and wrap them in "[" "]" so the parser
	  # sees a single JSON array
	  jsonData <- paste0("[", paste(jsonLines, collapse=","), "]")

	  # parse the data into a 2d data frame
	  yelpDf <- fromJSON(jsonData, flatten=TRUE)

	  # replace yelp ids with boston ids if this file has business ids
	  if("business_id" %in% colnames(yelpDf)){
	    yelpDf <- replaceYelpWithBostonIds(yelpDf, pathToRestaurantIdsCSV)
	  }

	  yelpDf
	}

	# ===== FUNCTIONS FOR INSPECTION DATA ===========
	# Read an inspections CSV and give its columns fixed, code-friendly names.
	#
	# Args:
	#   pathToInspectionsCSV: path of a CSV with a header row; its first column
	#     becomes the row names and the remaining five columns are renamed.
	#
	# Returns:
	#   A data frame with columns date, restaurant_id, one_star, two_stars and
	#   three_stars.
	loadInspections <- function(pathToInspectionsCSV){
	  # names applied to the columns that remain after the row-name column
	  inspectionColumns <- c("date", "restaurant_id", "one_star", "two_stars", "three_stars")

	  inspectionTable <- read.csv(pathToInspectionsCSV, header=TRUE, row.names=1)
	  names(inspectionTable) <- inspectionColumns

	  inspectionTable
	}

	# Write predictions to a CSV that uses the submission format's column names.
	#
	# Args:
	#   predictionDataFrame: predictions, with as many columns as the submission
	#     format has after its first (row-name) column.
	#   pathToSubmissionFormat: path of the submission-format CSV whose header
	#     supplies the column names.
	#   submissionFileName: path of the CSV file to write.
	writeSubmission <- function(predictionDataFrame, pathToSubmissionFormat, submissionFileName="new_submission.csv"){
	  # take the header exactly as written in the format file (no name mangling)
	  officialHeader <- colnames(read.csv(pathToSubmissionFormat, row.names=1, check.names=FALSE))

	  # relabel the predictions with the official column names
	  colnames(predictionDataFrame) <- officialHeader

	  # write the relabelled predictions to disk
	  write.csv(predictionDataFrame, file=submissionFileName)
	}

	# ====================================
	# LOAD THE DATA FROM DISK
	# ====================================

	# Read every Yelp dataset file; each call also gets the id-mapping CSV
	businesses <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_business.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	reviews <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_review.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	checkins <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_checkin.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	users <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_user.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)
	tips <- loadYelpData(
	  pathToYelpJson="data/yelp_academic_dataset_tip.json",
	  pathToRestaurantIdsCSV="data/restaurant_ids_to_yelp_ids.csv"
	)

	# Training labels and the submission template share the same column layout
	train <- loadInspections(pathToInspectionsCSV="data/train_labels.csv")
	test <- loadInspections(pathToInspectionsCSV="data/SubmissionFormat.csv")

	# ====================================
	# Make a simple test model
	# ====================================
	# Fit one linear model per violation level on each restaurant's Yelp star
	# rating and review count, then write the predictions as a submission file.
	#
	# Args (all optional; the defaults reproduce the original script, which read
	# the globals `businesses`, `train`, `test` and used fixed file names):
	#   businessData: data frame with business_id, stars and review_count.
	#   trainLabels: inspections with restaurant_id, one_star, two_stars and
	#     three_stars columns.
	#   testLabels: inspections to predict for, laid out like trainLabels.
	#   pathToSubmissionFormat: submission-format CSV passed to writeSubmission.
	#   submissionFileName: name of the CSV file to write.
	#
	# Returns:
	#   Whatever writeSubmission returns; called for its side effect of writing
	#   the submission file.
	makeSimplePredictions <- function(businessData=businesses,
	                                  trainLabels=train,
	                                  testLabels=test,
	                                  pathToSubmissionFormat="data/SubmissionFormat.csv",
	                                  submissionFileName="stars_review_count.csv"){
	  featureColumns <- c("stars", "review_count")

	  # look up the features of every inspected restaurant; restaurants that are
	  # missing from businessData get NA features (and therefore NA predictions)
	  X_train <- businessData[match(trainLabels$restaurant_id, businessData$business_id), featureColumns]
	  X_test <- businessData[match(testLabels$restaurant_id, businessData$business_id), featureColumns]

	  # the model frames do not change between violation levels: build them once
	  trainFrame <- cbind(trainLabels, X_train)
	  testFrame <- cbind(testLabels, X_test)

	  finalPredictions <- testLabels

	  for(starLevel in c("one_star", "two_stars", "three_stars")){
	    # formula for this level of violations
	    levelFormula <- as.formula(paste0(starLevel, " ~ stars + review_count"))

	    # fit a simple linear model
	    model <- lm(formula=levelFormula, data=trainFrame)

	    # predict the violations through the generic so dispatch picks the method
	    predictions <- predict(model, newdata=testFrame)

	    # force them to be integers (counts); as.integer truncates toward zero
	    predictions <- as.integer(predictions)

	    # clamp negative counts to 0 and store the predictions
	    finalPredictions[starLevel] <- pmax(predictions, 0)
	  }

	  writeSubmission(finalPredictions, pathToSubmissionFormat, submissionFileName)
	}
	# Run the baseline: fit the models and write "stars_review_count.csv"
	makeSimplePredictions()