################################################################################
# Functions to Support CPE and Census Data Alignment
################################################################################
################################################################################
# Functions to find the data
################################################################################
# Finders
# Simple functions that take the ID code from CPE (e.g. 49-00039) and look up
# the respective data for it in the structure provided in the competition
find_police_shape <- function(dept_id, kaggle_kernel = FALSE) {
  if (kaggle_kernel == TRUE) {
    prefix <- "../input/cpe-data/"
  } else {
    prefix <- "data/"
  }
  # This function will be modified when placed in a production environment to
  # follow the data storage/path scheme used in production. It could be
  # modified to fetch data from a database or a web service instead.
  fl <- list.files(paste0(prefix, "Dept_", dept_id, "/", dept_id, "_Shapefiles"),
                   full.names = TRUE)
  # Anchor the pattern so we match only .shp files, not e.g. .shp.xml sidecars
  path_to_sf <- grep("\\.shp$", fl, value = TRUE)[1]
  return(path_to_sf)
}
find_police_csv <- function(dept_id, kaggle_kernel = FALSE) {
  if (kaggle_kernel == TRUE) {
    prefix <- "../input/cpe-data/"
  } else {
    prefix <- "data/"
  }
  fl <- list.files(paste0(prefix, "Dept_", dept_id),
                   full.names = TRUE)
  # What if there is more than one CSV? For now we just return the first
  path_to_csv <- grep("\\.csv$", fl, value = TRUE)[1]
  return(path_to_csv)
}
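# Example usage (hypothetical department ID; assumes the competition data
# layout described above exists under data/):
# find_police_shape("49-00039")  # path to the department's .shp file
# find_police_csv("49-00039")    # path to the department's incident CSV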
################################################################################
# Functions to read in the data
################################################################################
get_county <- function(city, state) {
  result <- ggmap::geocode(paste0(city, ",", state), output = "latlona",
                           source = "google")
  # Round coordinates to two decimal places to stabilize the county lookup
  result[, 1] <- round(result[, 1], 2)
  result[, 2] <- round(result[, 2], 2)
  county <- latlong2county(result[, 1:2])
  county <- unlist(lapply(strsplit(county, split = ","), "[", 2))
  # If the lookup failed, retry at a lower resolution (one decimal place);
  # mc indexes the rows with a missing county
  mc <- which(is.na(county))
  result[mc, 1] <- round(result[mc, 1], 1)
  result[mc, 2] <- round(result[mc, 2], 1)
  county2 <- latlong2county(result[mc, 1:2])
  county2 <- unlist(lapply(strsplit(county2, split = ","), "[", 2))
  county[mc] <- county2
  return(county)
}
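# get_county() depends on a latlong2county() helper that is not defined in
# this file. A minimal sketch, assuming the maps, maptools, and sp packages,
# based on the common point-in-polygon recipe:
latlong2county <- function(points_df) {
  # points_df: data.frame with longitude in column 1, latitude in column 2
  counties <- maps::map("county", fill = TRUE, col = "transparent", plot = FALSE)
  # County polygon names look like "state,county" (":part" suffixes dropped)
  ids <- sapply(strsplit(counties$names, ":"), function(x) x[1])
  counties_sp <- maptools::map2SpatialPolygons(
    counties, IDs = ids,
    proj4string = sp::CRS("+proj=longlat +datum=WGS84"))
  points_sp <- sp::SpatialPoints(points_df,
                                 proj4string = sp::CRS("+proj=longlat +datum=WGS84"))
  # Index of the county polygon containing each point (NA if none)
  indices <- sp::over(points_sp, counties_sp)
  # Return "state,county" names, matching the strsplit in get_county()
  sapply(counties_sp@polygons, function(x) x@ID)[indices]
}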
# Retrievers
# Google is needed for geocoding the county (see get_county above)
get_police_zones <- function(path_to_sf, city, state, county) {
  # First we read in the shapefile (the city argument is currently unused)
  police_districts <- rgdal::readOGR(dsn = path_to_sf,
                                     stringsAsFactors = FALSE)
  # Next, we standardize all police shapefiles as they come in to use a
  # projection compatible with the ACS. We do this by downloading the TIGRIS
  # file for each shapefile and getting its projection string
  admin_shapes <- tigris::tracts(state = state, county = county, cb = TRUE)
  # Extract the projection
  proj_use <- CRS(proj4string(admin_shapes))
  # Currently Austin, TX is missing any projection information in its
  # shapefile. This function includes a workaround to project Austin to the
  # coordinate reference system used by the police x,y coordinates in the
  # Austin data. This section could include more robust logic after analyzing
  # additional shapefiles with non-standard projections.
  if (is.na(proj4string(police_districts))) {
    proj4string(police_districts) <- CRS("+init=epsg:3664")
  }
  # Reproject the shapefile using the spTransform function
  police_districts <- sp::spTransform(police_districts, proj_use)
  # One funny thing is that police boundary files can extend far out into the
  # water. This makes resulting maps look unfamiliar to the public. We can use
  # the Census TIGRIS files to fetch the water boundaries and then slice off
  # any police boundaries that extend into the water
  outline <- tigris::counties(state = state,
                              cb = TRUE,
                              resolution = "500k")
  # Dissolve the counties into a single outline
  outline <- rgeos::gUnaryUnion(outline)
  # Slice and return only the overlap (intersect comes from the raster
  # package; rgeos does not export an intersect function)
  police_districts <- raster::intersect(police_districts, outline)
  return(police_districts)
}
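# Example usage (hypothetical city and department ID; assumes a Google
# geocoding key is configured for ggmap):
# county <- get_county("Seattle", "WA")
# pz <- get_police_zones(find_police_shape("49-00039"),
#                        city = "Seattle", state = "WA", county = county)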
# Get the CSV police data
get_police_data <- function(path_to_pd_data) {
  # Most of the CPE data files seem to have two header rows.
  # This removes the second header and keeps the first,
  # which appears to be more standardized.
  all_content <- readLines(path_to_pd_data, encoding = "UTF-8")
  skip_second <- all_content[-2]
  # If the data files got huge it could be worth considering an alternative
  # function to read the data in, but for this size read.csv works fine
  # (fileEncoding is omitted because it is ignored for connections)
  dat <- read.csv(file = textConnection(skip_second, encoding = "UTF-8"),
                  stringsAsFactors = FALSE,
                  header = TRUE)
  # CSV files can be encoded oddly; some of the files here start with a
  # UTF-8 byte-order mark (BOM), which shows up as "ï.." at the front of the
  # first column name. Without too much work we can just sub it out if it
  # appears (fixed = TRUE so the dots match literally).
  if (any(grepl("ï..", names(dat), fixed = TRUE))) {
    names(dat) <- gsub("ï..", replacement = "", names(dat), fixed = TRUE)
  }
  # Handling for a bad column name
  if ("SUBJECT_RACT" %in% names(dat)) {
    dat$SUBJECT_RACE <- dat$SUBJECT_RACT
    dat$SUBJECT_RACT <- NULL
  }
  return(dat)
}
# Get Census data from the API
get_census_data <- function(state, county, geography = "tract",
                            variables = NULL) {
  # We can start with a default set of Census variables, but the user
  # can pass a named character vector of variables and their Census ID number
  if (is.null(variables)) {
    message("Using default variables")
    variables <- c("total_people" = "DP05_0028",
                   "total_households" = "DP02_0001",
                   "total_family_households" = "DP02_0002",
                   "total_white" = "DP05_0032",
                   "total_black" = "DP05_0033",
                   "total_hispanic" = "DP05_0066",
                   "age_over_18" = "DP05_0018")
  }
  census_geo <- tidycensus::get_acs(geography = geography, variables = variables,
                                    state = state, county = county,
                                    geometry = TRUE)
  return(census_geo)
}
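# Note: tidycensus::get_acs() requires a free Census API key. One-time setup:
# tidycensus::census_api_key("YOUR_KEY_HERE", install = TRUE)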
################################################################################
# Clean and normalize geometry data
################################################################################
# Give a shapefile polygon data frame unique row names
makeUniform <- function(SPDF) {
  # Prefix each row name with the object's name to guarantee uniqueness
  pref <- substitute(SPDF)
  newSPDF <- sp::spChFIDs(SPDF,
                          as.character(paste(pref,
                                             rownames(as(SPDF, "data.frame")),
                                             sep = "_")))
  return(newSPDF)
}
# Census data bound to police polygon data
census_to_pd_bound <- function(pd_poly, census_poly, pd_poly_id = NULL,
                               type = c("sums", "means")) {
  type <- match.arg(type)
  # Default to the first column of the police data as the polygon ID
  if (is.null(pd_poly_id)) {
    pd_poly_id <- names(pd_poly@data)[1]
  }
  # We need to convert to a Spatial object to take advantage of the area
  # function, and in so doing we need to preserve the coordinate systems
  census_poly <- st_transform(census_poly, proj4string(pd_poly))
  try({
    cd_poly <- sf::as_Spatial(st_cast(census_poly, "MULTIPOLYGON"))
  })
  if (!exists("cd_poly", inherits = FALSE)) {
    cd_poly <- sf::as_Spatial(st_cast(census_poly, "POLYGON"))
  }
  # Unify coordinate systems
  if (inherits(pd_poly, "SpatialPointsDataFrame")) {
    pd_poly <- spTransform(pd_poly, CRS(st_crs(census_poly)$proj4string))
  } else {
    pd_poly <- makeUniform(pd_poly)
    pd_poly <- spTransform(pd_poly, CRS(st_crs(census_poly)$proj4string))
  }
  # Compute the area of each tract and append it; this is needed for computing
  # the proportion lying in the police boundary
  cd_poly@data$tract_area <- raster::area(cd_poly)
  # Find the census variables we are going to use
  vars <- unique(cd_poly$variable)
  # Set up a dataframe to store our variables in a loop
  out <- as.data.frame(setNames(replicate(length(vars), numeric(0),
                                          simplify = FALSE),
                                vars))
  # We loop through each polygon in the police data, find every polygon it
  # intersects with in the Census data, and calculate the proportion of each
  # Census polygon that lies within the PD polygon. That proportion becomes
  # the Census polygon's weight. Then, depending on the measure type, we take
  # a weighted sum (counts) or a weighted mean (percentages, rates, average
  # income, etc.).
  for (i in unique(pd_poly@data[, pd_poly_id])) {
    # Slice out one polygon at a time
    tmp_poly <- pd_poly[pd_poly@data[, pd_poly_id] == i, ]
    # Workaround for column names not stored correctly
    if (nrow(tmp_poly@data) == 0) {
      tmp_poly <- pd_poly[pd_poly@data[, pd_poly_id] == as.character(i), ]
    }
    # Compute the intersection of the census and the selected PD polygon
    zzz <- raster::intersect(cd_poly, tmp_poly)
    # Calculate the intersection area
    zzz@data$intersect_area <- raster::area(zzz)
    # Now take the proportion of the intersecting area out of the total area
    # for all the tracts that cross into the PD district
    zzz@data$weight <- zzz@data$intersect_area / zzz@data$tract_area
    if (type == "sums") {
      out_df <- zzz@data %>% group_by(variable) %>%
        mutate(weight_est = estimate * weight) %>%
        # We ignore NAs here because if the Census polygon has no data, we
        # can safely count it as 0
        summarize(estimate = sum(weight_est, na.rm = TRUE)) %>%
        mutate("id" = i) %>%
        as.data.frame()
    } else if (type == "means") {
      out_df <- zzz@data %>% group_by(variable) %>%
        summarize(estimate = weighted.mean(estimate, weight, na.rm = TRUE)) %>%
        mutate("id" = i) %>%
        as.data.frame()
    }
    # reshape's idvar defaults to "id", which matches the column created above
    out_df <- reshape(out_df, direction = "wide", timevar = "variable")
    out <- rbind(out, out_df)
  }
  pd_poly@data <- merge(pd_poly@data, out, by.x = pd_poly_id,
                        by.y = "id")
  return(pd_poly)
}
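# Worked example of the weighting above (illustrative numbers): if a tract of
# area 10 km^2 has estimate = 1000 people and 4 km^2 of it falls inside a
# police district, then weight = 4/10 = 0.4 and the tract contributes
# 1000 * 0.4 = 400 people to the district's weighted sum. For rate variables
# the same 0.4 enters weighted.mean() as that tract's weight instead.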
# Example census data variable definitions
census_count_variables <-
  c("total_people" = "DP05_0028",
    "total_households" = "DP02_0001",
    "total_family_households" = "DP02_0002",
    "total_white" = "DP05_0032",
    "total_black" = "DP05_0033",
    "total_hispanic" = "DP05_0066",
    "age_over_18" = "DP05_0018")
census_rate_variables <-
  c("percent_white" = "DP05_0032P",
    "occupied_housing" = "DP04_0002P",
    "health_insurance_cov" = "DP03_0096P",
    "median_family_income" = "DP03_0086")
# Combine the Census data and the police data in a city list into a new
# police boundary object carrying the Census estimates
augment_census_data <- function(city_list) {
  census_sf <- get_census_data(state = city_list$state, county = city_list$county,
                               geography = "tract",
                               variables = census_count_variables)
  zzz <- census_to_pd_bound(pd_poly = city_list$pz,
                            census_poly = census_sf, type = "sums")
  census_sf <- get_census_data(state = city_list$state, county = city_list$county,
                               geography = "tract",
                               variables = census_rate_variables)
  zzb <- census_to_pd_bound(pd_poly = city_list$pz,
                            census_poly = census_sf, type = "means")
  zzz@data <- left_join(zzz@data, zzb@data)
  return(zzz)
}
# Unify police boundaries and combine police boundaries with police data
augment_pd_census <- function(city_list) {
  if ("lat" %in% names(city_list$pd)) {
    # Drop rows with missing coordinates
    df <- city_list$pd[!is.na(city_list$pd$lat), ]
    city_points <- st_as_sf(x = df,
                            coords = c("lon", "lat"),
                            crs = proj4string(city_list$pz))
    out <- st_intersection(city_points, st_as_sf(city_list$pz))
  } else {
    # No coordinates: fall back to joining on the district identifier
    zones_sf <- st_as_sf(city_list$pz)
    zones_sf$LOCATION_DISTRICT <- as.character(zones_sf$LOCATION_DISTRICT)
    city_list$pd$LOCATION_DISTRICT <- as.character(city_list$pd$LOCATION_DISTRICT)
    out <- left_join(city_list$pd, zones_sf, by = c("LOCATION_DISTRICT"))
  }
  return(out)
}
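# The city_list argument used by the augment_* functions is assumed (it is
# built outside this file) to be a list shaped like:
# city_list <- list(
#   state  = "WA",          # state abbreviation
#   county = "king",        # county name, as returned by get_county()
#   pz     = police_zones,  # SpatialPolygonsDataFrame from get_police_zones()
#   pd     = police_data    # data.frame from get_police_data()
# )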
################################################################################
# Cleaning data
################################################################################
# Functions that take a dataframe from CPE and return the dataframe with
# columns cleaned
detect_cpe_data_type <- function(df) {
  # Rough-and-ready rules for using column logic to detect the type of
  # incident data in the CPE files.
  # Five possibilities for now:
  # UOF
  # UOF + INJURY
  # OIS
  # SEARCH+VEHICLE
  # ARREST
  if (any(grepl("TYPE_OF_FORCE_USED", names(df)))) {
    if (any(grepl("INJURY", names(df)))) {
      return("UOF+INJURY")
    } else {
      return("UOF")
    }
  } else if (any(c("SUBJECT_INJURY_TYPE",
                   "SUBJECT_DEATH") %in% names(df))) {
    return("OIS")
  }
  if (any(grepl("SEARCH", names(df)))) {
    return("SEARCH+VEHICLE")
  } else {
    return("ARREST")
  }
}
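# Example: a frame with a force column but no injury column is typed as UOF
# detect_cpe_data_type(data.frame(TYPE_OF_FORCE_USED = "Taser"))  # "UOF"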
# Normalize and standardize incident IDs
recode_incident_ids <- function(df) {
  if ("RIN" %in% names(df)) {
    df$INCIDENT_UNIQUE_IDENTIFIER <- df$RIN
    df$RIN <- NULL
  }
  # Double ID from Boston
  if ("INCIDENT_UNIQUE_IDENTIFIER.1" %in% names(df)) {
    df$INCIDENT_UNIQUE_IDENTIFIER <- paste(df$INCIDENT_UNIQUE_IDENTIFIER,
                                           df$INCIDENT_UNIQUE_IDENTIFIER.1,
                                           sep = "-")
    df$INCIDENT_UNIQUE_IDENTIFIER.1 <- NULL
  }
  # Seattle
  if ("INCIDENT_UNIQUE.IDENTIFIER.1" %in% names(df)) {
    df$INCIDENT_UNIQUE_IDENTIFIER <- paste(df$INCIDENT_UNIQUE_IDENTIFIER,
                                           df$INCIDENT_UNIQUE.IDENTIFIER.1,
                                           sep = "-")
    df$INCIDENT_UNIQUE.IDENTIFIER.1 <- NULL
  }
  # Deal with the UOF number for Dallas
  if ("UOF_NUMBER" %in% names(df)) {
    df$INCIDENT_UNIQUE_IDENTIFIER <- df$UOF_NUMBER
    df$UOF_NUMBER <- NULL
  }
  # Deal with faulty encoding of the identifier column name
  if (!"INCIDENT_UNIQUE_IDENTIFIER" %in% names(df) &
      any(grepl("INCIDENT_UNIQUE_IDENTIFIER", names(df)))) {
    var <- grep("INCIDENT_UNIQUE_IDENTIFIER", names(df), value = TRUE)
    df$INCIDENT_UNIQUE_IDENTIFIER <- df[, var]
    df[, var] <- NULL
  }
  # Last resort: number the rows
  if (!"INCIDENT_UNIQUE_IDENTIFIER" %in% names(df)) {
    df$INCIDENT_UNIQUE_IDENTIFIER <- seq_len(nrow(df))
  }
  return(df)
}
# Calculate the proportion of a vector that is missing.
# Used to assess the quality of latitude and longitude coordinates
prop_na <- function(x) {
  D <- length(x)
  N <- sum(is.na(x))
  return(round(N / D, digits = 2))
}
# Fix Austin data to have proper coordinates.
# Works with other variables to standardize on lat/lon variable names
normalize_location <- function(df) {
  if (any(grepl("latitude", names(df), ignore.case = TRUE))) {
    # Calculate the proportion of missing lat/lon values. We can use this in
    # the current set of data to identify alternative coordinate storage.
    missing_lat <- prop_na(df[, grep("latitude", names(df), ignore.case = TRUE,
                                     value = TRUE)])
    missing_long <- prop_na(df[, grep("longitude", names(df), ignore.case = TRUE,
                                      value = TRUE)])
    if (any(c(missing_lat, missing_long) > 0.1) &
        any(grepl("Y_COORDINATE", names(df), ignore.case = TRUE))) {
      utm_coords <- df[, c("Y_COORDINATE", "Y_COORDINATE.1")]
      # Convert to numeric
      utm_coords <- sapply(utm_coords, as.numeric)
      # Trim an obvious outlier
      utm_coords <- na.omit(utm_coords[utm_coords[, 1] < 38800000, ])
      # Look up the reference system; a lookup table is available via
      # EPSG <- make_EPSG(). EPSG:3664 is a close match (EPSG:2277 appears
      # identical).
      crs <- CRS("+init=epsg:3664")
      # Convert to a spatial points object
      sputm <- SpatialPoints(coords = utm_coords[, c(1, 2)],
                             crs)
      # Reproject to longitude/latitude
      spgeo <- spTransform(sputm, CRS("+proj=longlat +datum=WGS84"))
      # Rejoin. For now let's not worry about reconciling all possible data
      # points; we only keep those we can geocode, which is the vast majority
      df <- df[!is.na(as.numeric(df$Y_COORDINATE.1)), ]
      df <- df[df[, "Y_COORDINATE"] < 38800000, ]
      df[, colnames(spgeo@coords)] <- spgeo@coords
      names(df)[which(names(df) %in% colnames(spgeo@coords))] <- c("lon", "lat")
      df$Y_COORDINATE <- NULL
      df$Y_COORDINATE.1 <- NULL
      df$LOCATION_LATITUDE <- NULL
      df$LOCATION_LONGITUDE <- NULL
    } else {
      names(df)[grep("latitude", names(df), ignore.case = TRUE)] <- "lat"
      names(df)[grep("longitude", names(df), ignore.case = TRUE)] <- "lon"
    }
  }
  return(df)
}
# Search for closely matching factor levels between two vectors of possible
# IDs. This function could definitely be improved!
fuzz_finder <- function(levels, target) {
  target <- as.character(target)
  levels <- as.character(levels)
  score <- 0
  # Score every candidate level, not just the first one
  for (i in seq_along(levels)) {
    adder <- sum(agrepl(pattern = levels[i], x = target,
                        max.distance = list("deletions" = 0.25,
                                            "insertions" = 0.5,
                                            "cost" = 0.5)))
    adder <- ifelse(is.na(adder), 0, adder)
    score <- score + adder
  }
  return(score)
}
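# Example usage (illustrative, hypothetical column names): score how well the
# district codes in the police data match a candidate polygon column
# fuzz_finder(levels = unique(pd$LOCATION_DISTRICT), target = pz@data$DISTRICT)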
# Normalize district IDs between pz and pd by copying the polygon column that
# best matches the dataset's LOCATION_DISTRICT values into LOCATION_DISTRICT
align_location_districts <- function(city_list) {
  if (!"LOCATION_DISTRICT" %in% names(city_list$pd)) {
    return(city_list$pz)
  } else {
    pd_ids <- unique(city_list$pd$LOCATION_DISTRICT)
    best_score <- 0
    best_col <- 0
    for (i in 1:ncol(city_list$pz@data)) {
      score <- fuzz_finder(pd_ids, city_list$pz@data[, i])
      # Exact-match alternative:
      # score <- sum(pd_ids %in% city_list$pz@data[, i])
      best_col <- ifelse(score > best_score, i, best_col)
      best_score <- ifelse(score > best_score, score, best_score)
    }
    city_list$pz@data$LOCATION_DISTRICT <- city_list$pz@data[, best_col]
    return(city_list$pz)
  }
}
# Standardize USE OF FORCE levels in CPE data
force_6_lev <- function(x) {
  # Function adapted from
  # https://www.kaggle.com/fkosmowski/part-1-standardizing-use-of-force-datasets
  # (str_detect comes from the stringr package; the odd-looking string
  # constants below match values as they appear in the raw data)
  for (i in 1:ncol(x)) {
    var <- as.character(x[, i])
    var[str_detect(var, paste(c('Level 1', 'level1', 'LEVEL 1', "LEVEL1", "Display", 'display'),
                              collapse = '|'))] <- "Officer presence (L1)"
    var[str_detect(var, paste(c('Level 2', 'level2', 'LEVEL 2', "LEVEL2", "Verbal", "VERBAL"),
                              collapse = '|'))] <- "Verbal commands (L2)"
    var[str_detect(var, paste(c('Level 3', 'level3', 'LEVEL 3', "LEVEL3",
                                "Pressure", "PRESSURE", 'Weight', 'WEiGHT', 'LVNR',
                                'Handcuffing', 'Hand', 'HAND', 'Down', 'DOWN',
                                'JOiNT', 'Joint Locks', 'ForceGeneral',
                                'BodilyForceType', 'BD', 'Combat',
                                'COMBAT', 'PiT', 'Leg', 'LEG'), collapse = '|'))] <- "Control Tactics (L3)"
    var[str_detect(var, paste(c('Level 4', 'level4', 'LEVEL 4',
                                "LEVEL4", "Strike", 'Kick', "Physical",
                                'Chemirritant', "OC", "SPRAY", 'CHEMiCAL',
                                'pepper', 'Pepper', 'PEPPER',
                                'Projectile', 'ChemIrritant'), collapse = '|'))] <- "Hard control tactics (L4)"
    var[str_detect(var, paste(c('Level 5', 'level5', 'LEVEL 5', "LEVEL5",
                                "Less Lethal", 'LESS LETHAL', 'Baton',
                                'BATON', "BatonForce", 'BATONFORCE',
                                "Taser", "TASER", 'CED',
                                'Conducted Energy Device', "Canine",
                                'CANiNE', 'K9', 'K-9', 'Weapon', 'WEAPON'), collapse = '|'))] <- "Weapons (L5)"
    var[str_detect(var, paste(c('Level 6', 'level6', 'LEVEL 6', "LEVEL6",
                                "Vehicle", 'FirearmType', 'Shotgun',
                                'SHOTGUN', 'FiREARM', 'Firearm',
                                'SHOTS', 'Shots'), collapse = '|'))] <- "Lethal force (L6)"
    var[str_detect(var, paste(c('-', 'Other', 'OTHER'), collapse = '|'))] <- "Other / missing"
    x[, i] <- as.factor(var)
    x[, i] <- droplevels(x[, i])
  }
  if (ncol(x) == 1) {
    return(as.factor(x[, 1]))
  } else {
    # Top-code use of force by the most severe force used
    x$allvar <- apply(x, 1, paste, collapse = "-")
    out <- data.frame("TYPE_OF_FORCE" = sapply(x$allvar, topcode_force_level),
                      stringsAsFactors = FALSE)
    out <- as.factor(out[, 1])
    return(out)
  }
}
# For cases where we have multiple uses of force for one incident, take
# the most severe
topcode_force_level <- function(x) {
  if (grepl("L6", x)) {
    return("Lethal force (L6)")
  } else if (grepl("L5", x)) {
    return("Weapons (L5)")
  } else if (grepl("L4", x)) {
    return("Hard control tactics (L4)")
  } else if (grepl("L3", x)) {
    return("Control Tactics (L3)")
  } else if (grepl("L2", x)) {
    return("Verbal commands (L2)")
  } else if (grepl("L1", x)) {
    return("Officer presence (L1)")
  } else {
    return("None")
  }
}
# Recode dataframe and return correct object types
force_recode <- function(df) {
  force_vars <- grep("FORCE_USED", names(df), ignore.case = TRUE, value = TRUE)
  # grep(value = TRUE) returns a zero-length vector, never NULL
  if (length(force_vars) == 0) {
    return(df)
  } else {
    df$"TYPE_OF_FORCE" <- force_6_lev(df[, force_vars, drop = FALSE])
    return(df)
  }
}
# Recode any variable with _RACE in its name
recode_race <- function(df) {
  race_vars <- grep("_RACE", names(df), value = TRUE)
  '%ni%' <- Negate('%in%')
  if (length(race_vars) > 0) {
    for (i in race_vars) {
      df[, i] <- as.character(df[, i])
      df[, i][str_detect(df[, i], "Asian")] <- "Asian"
      df[, i][str_detect(df[, i], "Black")] <- "Black"
      df[, i][str_detect(df[, i], paste(c("Hispanic", "Latino"), collapse = '|'))] <- "Hispanic"
      df[, i][str_detect(df[, i], "White")] <- "White"
      df[, i][str_detect(df[, i], paste(c("Other", "Native", "Ind", "Pac"), collapse = '|'))] <- "Other"
      df[, i][str_detect(df[, i], paste(c("Unk", "UNK",
                                          "DATA", "Data", "NULL", "Not", "specified",
                                          "recorded"), collapse = '|'))] <- "Unknown"
      # Letter-based, for remaining categories
      df[, i][df[, i] == 'A'] <- "Asian"
      df[, i][df[, i] == 'B'] <- "Black"
      df[, i][df[, i] == 'H'] <- "Hispanic"
      df[, i][df[, i] == 'W'] <- "White"
      # Anything left over becomes Other (preserving the Other and Unknown
      # values assigned above, which the original keep-list would clobber)
      df[, i][df[, i] %ni%
                c('Asian', 'Black', 'Hispanic', 'White', 'Other', 'Unknown') &
                !is.na(df[, i])] <- "Other"
    }
    return(df)
  } else {
    return(df)
  }
}
# Recode times
code_incident_time <- function(df) {
  if (!"INCIDENT_TIME" %in% names(df)) {
    return(df)
  } else {
    # TODO: Automatically find the timezone
    # Handling for cases where the date is recorded in the time field (Boston)
    if (any(grepl("/", df$INCIDENT_TIME))) {
      tmp_time <- df$INCIDENT_TIME
    } else {
      tmp_time <- paste(df$INCIDENT_DATE, df$INCIDENT_TIME)
    }
    # parse_date_time comes from the lubridate package
    df$INCIDENT_TIME <-
      parse_date_time(tmp_time,
                      orders = c("Ymd HMS", "Ymd H!M!", "Ymd HM", "Ymd HMS p",
                                 "mdy HM"))
    return(df)
  }
}
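# End-to-end sketch (hypothetical department ID and location; assumes sp, sf,
# raster, dplyr, stringr, and lubridate are available):
# dept <- "49-00039"
# pd <- get_police_data(find_police_csv(dept))
# pd <- recode_incident_ids(pd)
# pd <- normalize_location(pd)
# pd <- force_recode(pd)
# pd <- recode_race(pd)
# pd <- code_incident_time(pd)
# county <- get_county("Seattle", "WA")
# pz <- get_police_zones(find_police_shape(dept), "Seattle", "WA", county)
# city <- list(state = "WA", county = county, pz = pz, pd = pd)
# city$pz <- align_location_districts(city)
# zones_with_census <- augment_census_data(city)  # polygons + ACS estimates
# incidents_in_zones <- augment_pd_census(city)   # incidents joined to zones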