ronrest · February 13, 2016 22:36
diff --git a/crime_data_reformatter.r b/crime_data_reformatter.r
 #===============================================================================
 #                                                                       TOY DATA
 #===============================================================================
 # Setting up a toy dataframe
 # One-row toy data frame: each column is a "<category>_<start>_<end>" count.
 df <- data.frame(
   total_crime_2015_2016 = 674,
   total_crime_2014_2015 = 323,
   total_crime_2013_2014 = 212,
   car_theft_2015_2016 = 34,
   car_theft_2014_2015 = 45,
   car_theft_2013_2014 = 74
 )
 # Label the single row so it is clear the values are counts.
 row.names(df) <- "count"


 #===============================================================================
 #                                                                       GET_YEAR
 #===============================================================================
 #' Extract the trailing year range from a column name
 #'
 #' Given a string such as `"total_crime_2013_2014"`, returns only the
 #' year-range suffix, e.g. `"2013_2014"`. A string that does not end in a
 #' `YYYY_YYYY` suffix is returned unchanged, because the substitution only
 #' fires when the pattern matches.
 #'
 #' @param s (string) the string to process. Character vectors are handled
 #'   element-wise, since `gsub()` is vectorised over its input.
 #' @return The year-range component of `s`, or `s` itself when no year range
 #'   is found at the end.
 get_year <- function(s) {
   # Pattern breakdown:
   #   .*                    any run of leading characters (the label part)
   #   ([0-9]{4}_[0-9]{4})   capture group 1: 4 digits, underscore, 4 digits
   #   $                     the captured range must sit at the very end
   year_suffix_pattern <- ".*([0-9]{4}_[0-9]{4})$"

   # Replacing the whole match with "\\1" keeps only capture group 1.
   gsub(pattern = year_suffix_pattern, replacement = "\\1", x = s)
 }




 #===============================================================================
 #                                                    Extract Columns of Interest
 #===============================================================================
 # extract values from columns that satisfy some regular expression pattern. 
 # eg: 
 #    ^total_crime_[0-9]{4} tells us that we are : 
 #      - looking for strings that START with "total_crime_"
 #      - that it should then be followed by 4 numbers
 # This should be enough to uniquely pick out the relevant fields without overlap
 
 # `drop = FALSE` keeps each selection a data frame even when only one column
 # matches; without it a single match collapses to a bare vector and the
 # row-name assignment that follows fails.
 total_crime <- df[, grep("^total_crime_[0-9]{4}", names(df)), drop = FALSE]
 row.names(total_crime) <- "total_crime"

 car_theft <- df[, grep("^car_theft_[0-9]{4}", names(df)), drop = FALSE]
 row.names(car_theft) <- "car_theft"

 #===============================================================================
 #                                                        Rename the Column Names
 #===============================================================================
 # Rename the columns to only contain the years. get_year() wraps gsub(), which
 # is vectorised, so the whole names vector is converted in one call.
 names(total_crime) <- get_year(names(total_crime))
 names(car_theft) <- get_year(names(car_theft))

 #===============================================================================
 #                                                       Create the new DataFrame
 #===============================================================================
 # Stack the per-category rows into one data frame. rbind() on data frames
 # matches columns by name, so both categories must cover the same year ranges.
 new_df <- rbind(total_crime, car_theft)


 # Creates a dataframe that looks like this
 #             2015_2016 2014_2015 2013_2014
 # total_crime       674       323       212
 # car_theft          34        45        74
	#===============================================================================
	# TOY DATA
	#===============================================================================
	# Setting up a toy dataframe
	# One-row toy data frame: each column is a "<category>_<start>_<end>" count.
	df <- data.frame(
	  total_crime_2015_2016 = 674,
	  total_crime_2014_2015 = 323,
	  total_crime_2013_2014 = 212,
	  car_theft_2015_2016 = 34,
	  car_theft_2014_2015 = 45,
	  car_theft_2013_2014 = 74
	)
	# Label the single row so it is clear the values are counts.
	row.names(df) <- "count"


	#===============================================================================
	# GET_YEAR
	#===============================================================================
	#' Extract the trailing year range from a column name
	#'
	#' Given a string such as `"total_crime_2013_2014"`, returns only the
	#' year-range suffix, e.g. `"2013_2014"`. A string that does not end in a
	#' `YYYY_YYYY` suffix is returned unchanged, because the substitution only
	#' fires when the pattern matches.
	#'
	#' @param s (string) the string to process. Character vectors are handled
	#'   element-wise, since `gsub()` is vectorised over its input.
	#' @return The year-range component of `s`, or `s` itself when no year range
	#'   is found at the end.
	get_year <- function(s) {
	  # Pattern breakdown:
	  #   .*                    any run of leading characters (the label part)
	  #   ([0-9]{4}_[0-9]{4})   capture group 1: 4 digits, underscore, 4 digits
	  #   $                     the captured range must sit at the very end
	  year_suffix_pattern <- ".*([0-9]{4}_[0-9]{4})$"

	  # Replacing the whole match with "\\1" keeps only capture group 1.
	  gsub(pattern = year_suffix_pattern, replacement = "\\1", x = s)
	}




	#===============================================================================
	# Extract Columns of Interest
	#===============================================================================
	# extract values from columns that satisfy some regular expression pattern.
	# eg:
	# ^total_crime_[0-9]{4} tells us that we are :
	# - looking for strings that START with "total_crime_"
	# - that it should then be followed by 4 numbers
	# This should be enough to uniquely pick out the relevant fields without overlap

	# `drop = FALSE` keeps each selection a data frame even when only one column
	# matches; without it a single match collapses to a bare vector and the
	# row-name assignment that follows fails.
	total_crime <- df[, grep("^total_crime_[0-9]{4}", names(df)), drop = FALSE]
	row.names(total_crime) <- "total_crime"

	car_theft <- df[, grep("^car_theft_[0-9]{4}", names(df)), drop = FALSE]
	row.names(car_theft) <- "car_theft"

	#===============================================================================
	# Rename the Column Names
	#===============================================================================
	# Rename the columns to only contain the years. get_year() wraps gsub(), which
	# is vectorised, so the whole names vector is converted in one call.
	names(total_crime) <- get_year(names(total_crime))
	names(car_theft) <- get_year(names(car_theft))

	#===============================================================================
	# Create the new DataFrame
	#===============================================================================
	# Stack the per-category rows into one data frame. rbind() on data frames
	# matches columns by name, so both categories must cover the same year ranges.
	new_df <- rbind(total_crime, car_theft)


	# Creates a dataframe that looks like this
	#             2015_2016 2014_2015 2013_2014
	# total_crime       674       323       212
	# car_theft          34        45        74
No results found