Created
May 30, 2018 09:48
-
-
Save monkeycycle/c9006c6e006c1408e42591d734f10fb7 to your computer and use it in GitHub Desktop.
Assorted R utilities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load required packages | |
load_requirements <- function(pkg){ | |
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])] | |
if (length(new.pkg)) | |
install.packages(new.pkg, dependencies = TRUE) | |
sapply(pkg, require, character.only = TRUE) | |
} | |
#Returns all items in a list that are not contained in toMatch | |
#toMatch can be a single item or a list of items | |
exclude <- function (theList, toMatch){ | |
return(setdiff(theList,include(theList,toMatch))) | |
} | |
#Returns all items in a list that ARE contained in toMatch | |
#toMatch can be a single item or a list of items | |
include <- function (theList, toMatch){ | |
matches <- unique (grep(paste(toMatch,collapse="|"), | |
theList, value=TRUE)) | |
return(matches) | |
} | |
# Clean up bad XLS formatting | |
remove_comma_from_numeric <- function(s) { | |
gsub(",", "", s, fixed = TRUE) | |
} | |
# Read in x rows, format as header and sub that in as the header | |
read_datafile_2header <- function(filename, skip_rows=0, header_rows=1){ | |
filepath <- paste(data_src_path, filename, sep='') | |
# Read and format the headers which span multiple lines | |
headers <- read.csv(filepath, nrows=header_rows, header=FALSE) | |
headers_names <- sapply(headers,paste,collapse="_") | |
headers_names <- str_replace_all(headers_names, '-', '_') | |
headers_names <- str_replace_all(headers_names, ' ', '_') | |
headers_names <- str_replace_all(headers_names, '__', '_') | |
headers_names <- str_replace_all(headers_names, '__', '_') | |
df <- read.csv(file=filepath, skip = skip_rows, header=FALSE, stringsAsFactors=FALSE ) | |
names(df) <- headers_names | |
names(df) <- gsub(' ', '_', names(df), fixed = TRUE) | |
names(df) <- gsub(' ', '_', names(df), fixed = TRUE) | |
names(df) <- gsub('__', '_', names(df), fixed = TRUE) | |
names(df) <- gsub('-', '_', names(df), fixed = TRUE) | |
names(df) <- gsub(',', '', names(df), fixed = TRUE) | |
names(df) <- gsub('.', '', names(df), fixed = TRUE) | |
return(df) | |
} | |
# https://stackoverflow.com/questions/12945687/read-all-worksheets-in-an-excel-workbook-into-an-r-list-with-data-frames | |
read_excel_allsheets <- function(filename, tibble = FALSE) { | |
# I prefer straight data.frames | |
# but if you like tidyverse tibbles (the default with read_excel) | |
# then just pass tibble = TRUE | |
sheets <- readxl::excel_sheets(filename) | |
x <- lapply(sheets, function(X) readxl::read_excel(filename, sheet = X)) | |
if(!tibble) x <- lapply(x, as.data.frame) | |
names(x) <- sheets | |
x | |
} | |
cleaner_worksheet <- function(df){ | |
# # drop rows with missing values | |
# df <- df[rowSums(is.na(df)) == 0,] | |
# # remove serial comma from all variables | |
# df[,-1] <- as.numeric(gsub(",", "", as.matrix(df[,-1]))) | |
# # create numeric version of year variable for graphing | |
# df$Year <- as.numeric(substr(df$year, 1, 4)) | |
# return cleaned df | |
return(df) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment