Created
February 13, 2016 22:36
-
-
Save ronrest/578ef3111657586ecb2e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#=============================================================================== | |
# TOY DATA | |
#=============================================================================== | |
# Set up a one-row toy data frame. Each column is named
# <category>_<start year>_<end year> and holds a single count.
df <- data.frame(
  total_crime_2015_2016 = 674,
  total_crime_2014_2015 = 323,
  total_crime_2013_2014 = 212,
  car_theft_2015_2016 = 34,
  car_theft_2014_2015 = 45,
  car_theft_2013_2014 = 74
)
# Label the single row so it is clear the values are counts.
row.names(df) <- "count"
#=============================================================================== | |
# GET_YEAR | |
#=============================================================================== | |
#' get_year
#'
#' Takes a string `s` that takes the form such as:
#'
#'     "total_crime_2013_2014"
#'
#' And returns just the year range component of that string, eg:
#'
#'     "2013_2014"
#'
#' The function is vectorised: a character vector of names gives back a
#' character vector of the same length. An element that does not end in a
#' `dddd_dddd` year range is returned unchanged.
#'
#' @param s (string) the string, or character vector of strings, to process
#' @return (string) the trailing year range of each element of `s`
get_year <- function(s) {
  # .*                = any run of leading characters (greedy)
  # ()                = capture group, referred to later as \\1
  # [0-9]{4}_[0-9]{4} = exactly 4 digits, an underscore, then exactly 4 more
  #                     digits
  # $                 = the captured year range must sit at the very end of
  #                     the string
  #
  # The whole match is replaced by the captured group ("\\1"), which leaves
  # only the year range. Because the pattern is anchored to the end of the
  # string it can match at most once, so sub() is sufficient.
  sub(".*([0-9]{4}_[0-9]{4})$", "\\1", s)
}
#=============================================================================== | |
# Extract Columns of Interest | |
#=============================================================================== | |
# Extract the columns whose names satisfy a regular expression pattern.
# eg:
#     ^total_crime_[0-9]{4} tells us that we are:
#     - looking for strings that START with "total_crime_"
#     - that it should then be followed by 4 digits
# This should be enough to uniquely pick out the relevant fields without overlap.
#
# drop = FALSE keeps the result a data frame even when only one column matches;
# without it the selection collapses to a bare vector and the row.names
# assignment below fails.
total_crime <- df[, grep("^total_crime_[0-9]{4}", names(df)), drop = FALSE]
row.names(total_crime) <- "total_crime"
car_theft <- df[, grep("^car_theft_[0-9]{4}", names(df)), drop = FALSE]
row.names(car_theft) <- "car_theft"
#=============================================================================== | |
# Rename the Column Names | |
#=============================================================================== | |
# Rename the columns to only contain the years. get_year() is built on a
# vectorised regex substitution, so the whole vector of names is passed in one
# call and a plain character vector comes back.
names(total_crime) <- get_year(names(total_crime))
names(car_theft) <- get_year(names(car_theft))
#=============================================================================== | |
# Create the new DataFrame | |
#=============================================================================== | |
# Create a new dataframe with the extracted data. Both pieces now share the
# same year-range column names, so rbind() stacks them as one row per category.
new_df <- rbind(total_crime, car_theft)
# Creates a dataframe that looks like this
#             2015_2016 2014_2015 2013_2014
# total_crime       674       323       212
# car_theft          34        45        74
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment