# Created May 28, 2015 15:18
#
# Save drivendata/52dbb123980e8a62e5fa to your computer and use it in GitHub Desktop.
# File: functions_to_keep_it_fresh.R
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# INSTALL JSON PACKAGE IF NEEDED
# install.packages("jsonlite")
# jsonlite provides fromJSON(), which is used to parse the Yelp files below
library("jsonlite")
# ===== FUNCTIONS FOR ID MATCHING ============

# Build the inverse of the restaurant-id csv: a lookup from yelp id to
# restaurant id.
#
# The csv is read with its first column as the restaurant id and every
# remaining column as a yelp id belonging to that restaurant; empty cells are
# treated as missing.
#
# Args:
#   pathToRestaurantIdsCSV: path to the id-mapping csv file.
#
# Returns:
#   A one-column data frame holding the restaurant ids, whose row names are
#   the yelp ids. It has zero rows when the csv has no yelp-id columns.
getReverseIds <- function(pathToRestaurantIdsCSV) {
  # load the id map from the csv; keep ids as plain strings on every R version
  bosToYelp <- read.csv(pathToRestaurantIdsCSV,
                        na.strings = "",
                        stringsAsFactors = FALSE)

  # seq_len()[-1] is empty for a one-column file, whereas seq(2, 1) would
  # count down and index a column that does not exist
  yelpColumns <- seq_len(ncol(bosToYelp))[-1]
  if (length(yelpColumns) == 0) {
    return(bosToYelp[0, 1, drop = FALSE])
  }

  reversedColumns <- lapply(yelpColumns, function(j) {
    # pair the restaurant ids with this yelp-id column
    reverse <- bosToYelp[, c(1, j)]
    # drop restaurants that have no yelp id in this column
    reverse <- reverse[complete.cases(reverse), ]
    # switch the rownames to the yelp id
    row.names(reverse) <- reverse[, 2]
    # keep only the restaurant id, still as a data frame
    reverse[, 1, drop = FALSE]
  })

  # stack the reversed columns
  do.call("rbind", reversedColumns)
}
# Replace the yelp ids in a data frame's business_id column with restaurant ids.
#
# Args:
#   yelpDataFrame: data frame with a business_id column holding yelp ids.
#   pathToRestaurantIdsCSV: path to the id-mapping csv, passed on to
#     getReverseIds().
#
# Returns:
#   yelpDataFrame with business_id overwritten by the mapped restaurant ids.
#   Yelp ids that are not found in the mapping become NA.
replaceYelpWithBostonIds <- function(yelpDataFrame, pathToRestaurantIdsCSV) {
  # yelp id is the row name, restaurant id is the single column
  yelpToBos <- getReverseIds(pathToRestaurantIdsCSV)

  # position of every business id in the lookup (NA when it is not listed)
  lookupRows <- match(yelpDataFrame$business_id, row.names(yelpToBos))

  # pull the ids out of the column explicitly rather than relying on
  # `[rows, ]` collapsing a one-column data frame to a vector
  yelpDataFrame$business_id <- yelpToBos[[1]][lookupRows]
  yelpDataFrame
}
# ===== FUNCTIONS FOR LOADING YELP DATA ============

# Load a Yelp data file that stores one JSON value per line.
#
# The lines are joined with commas and wrapped in "[" "]" so the whole file
# can be parsed as a single JSON array.
#
# Args:
#   pathToYelpJson: path to the line-delimited JSON file.
#   pathToRestaurantIdsCSV: path to the id-mapping csv; only used when the
#     parsed data has a business_id column.
#
# Returns:
#   The result of fromJSON(..., flatten=TRUE) on the assembled array, with
#   business_id (if present) translated by replaceYelpWithBostonIds().
loadYelpData <- function(pathToYelpJson, pathToRestaurantIdsCSV) {
  jsonLines <- readLines(pathToYelpJson)

  # a blank line would turn into an empty array element (",,") and make the
  # assembled text invalid JSON, so drop whitespace-only lines first
  jsonLines <- jsonLines[nzchar(trimws(jsonLines))]

  # separate the records by "," and add "[" "]" to tell the JSON parser it is
  # an array
  jsonData <- paste0("[", paste(jsonLines, collapse = ","), "]")

  # parse the data into a 2d data frame
  yelpDf <- fromJSON(jsonData, flatten = TRUE)

  # replace yelp ids with boston ids if this file has business ids
  if ("business_id" %in% colnames(yelpDf)) {
    yelpDf <- replaceYelpWithBostonIds(yelpDf, pathToRestaurantIdsCSV)
  }
  yelpDf
}
# ===== FUNCTIONS FOR INSPECTION DATA ===========

# Load an inspections csv and give its columns fixed, syntactic names.
#
# The first csv column is used as the row names; the remaining five columns
# are renamed to date, restaurant_id, one_star, two_stars and three_stars.
#
# Args:
#   pathToInspectionsCSV: path to the csv file (with a header row).
#
# Returns:
#   A data frame with the five columns named above.
#
# Stops with an error when the file does not have exactly five columns after
# the row-name column, instead of silently producing NA column names.
loadInspections <- function(pathToInspectionsCSV) {
  inspectionColumns <- c("date", "restaurant_id",
                         "one_star", "two_stars", "three_stars")

  inspections <- read.csv(pathToInspectionsCSV,
                          header = TRUE,
                          row.names = 1)

  if (ncol(inspections) != length(inspectionColumns)) {
    stop("expected ", length(inspectionColumns),
         " columns after the id column in ", pathToInspectionsCSV,
         ", found ", ncol(inspections),
         call. = FALSE)
  }

  colnames(inspections) <- inspectionColumns
  inspections
}
# Write predictions to a csv using the column names of the submission format.
#
# Args:
#   predictionDataFrame: data frame of predictions; its columns are renamed
#     positionally to the submission format's column names.
#   pathToSubmissionFormat: path to the submission-format csv. Its first
#     column is read as row names and its header is kept verbatim
#     (check.names=FALSE).
#   submissionFileName: path of the csv file to write.
#
# Stops with an error when the two data frames have a different number of
# columns, because a positional rename would then mislabel the output.
writeSubmission <- function(predictionDataFrame, pathToSubmissionFormat,
                            submissionFileName = "new_submission.csv") {
  # get the submission format from the file
  submissionFormat <- read.csv(pathToSubmissionFormat,
                               check.names = FALSE,
                               row.names = 1)

  if (ncol(predictionDataFrame) != ncol(submissionFormat)) {
    stop("predictions have ", ncol(predictionDataFrame),
         " columns but the submission format has ", ncol(submissionFormat),
         call. = FALSE)
  }

  # update the predictions with the proper column names
  colnames(predictionDataFrame) <- colnames(submissionFormat)

  # write the predictions to a file
  write.csv(predictionDataFrame, submissionFileName)
}
# ====================================
# LOAD THE DATA FROM DISK
# ====================================

# Every Yelp file is loaded with the same id-mapping csv, so the path is
# named once instead of being repeated in each call.
restaurantIdsCSV <- "data/restaurant_ids_to_yelp_ids.csv"

# Load all of the datums!!
businesses <- loadYelpData("data/yelp_academic_dataset_business.json", restaurantIdsCSV)
reviews <- loadYelpData("data/yelp_academic_dataset_review.json", restaurantIdsCSV)
checkins <- loadYelpData("data/yelp_academic_dataset_checkin.json", restaurantIdsCSV)
users <- loadYelpData("data/yelp_academic_dataset_user.json", restaurantIdsCSV)
tips <- loadYelpData("data/yelp_academic_dataset_tip.json", restaurantIdsCSV)

# the training labels and the submission format go through the same loader,
# so both end up with the same column names
train <- loadInspections("data/train_labels.csv")
test <- loadInspections("data/SubmissionFormat.csv")
# ====================================
# Make a simple test model
# ====================================

# Fit one linear model per violation level and write the predictions to
# "stars_review_count.csv".
#
# Reads the global data frames `businesses`, `train` and `test`. Each model
# regresses a violation column (one_star, two_stars, three_stars) on the
# restaurant's `stars` and `review_count` taken from `businesses`.
# Predictions are truncated to integers by as.integer() and negative values
# are set to 0 before being written with writeSubmission().
makeSimplePredictions <- function() {
  predictors <- c("stars", "review_count")
  starLevels <- c("one_star", "two_stars", "three_stars")

  # look up the predictors for every inspected restaurant; restaurants that
  # are missing from `businesses` get a row of NAs
  X_train <- businesses[match(train$restaurant_id, businesses$business_id), predictors]
  X_test <- businesses[match(test$restaurant_id, businesses$business_id), predictors]

  # the combined frames do not change between violation levels, so build
  # them once outside the loop
  trainingData <- cbind(train, X_train)
  testData <- cbind(test, X_test)

  finalPredictions <- test
  for (starLevel in starLevels) {
    # e.g. one_star ~ stars + review_count, built as a real formula object
    modelFormula <- reformulate(predictors, response = starLevel)

    # fit a simple linear model
    model <- lm(modelFormula, data = trainingData)

    # predict the violations and force them to be integers (counts)
    predictions <- as.integer(predict(model, testData))

    # force them to be greater than or equal to 0
    predictions[predictions < 0] <- 0

    # store the predictions
    finalPredictions[starLevel] <- predictions
  }

  writeSubmission(finalPredictions, "data/SubmissionFormat.csv", "stars_review_count.csv")
}
# run the simple model and write its submission file
makeSimplePredictions()
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment