Last active
June 17, 2024 12:29
-
-
Save BioSciEconomist/be83a5352c91c1e82a5d0846635fa483 to your computer and use it in GitHub Desktop.
Data cleaning processing and analysis in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: R Code Palette
# | DATE: 3/11/20 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: | |
# *---------------------------------------------------------------- | |
#---------------------------------- | |
# documentation templates | |
# --------------------------------- | |
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: | |
# | DATE: | |
# | CREATED BY: | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: | |
# *---------------------------------------------------------------- | |
############################################################################### | |
############################################################################### | |
################# construction zone ############################# | |
############################################################################### | |
############################################################################### | |
## | |
## new section | |
## | |
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | reading data | |
# | | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# identify location of data used for project by specifying a working directory
# alternatively, if data is stored in numerous places, you can reference
# the file location directly
rm(list=ls()) # get rid of any existing data | |
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
cat("\f") # clear console | |
clc <- function() cat(rep("\n", 50)) | |
rm("temp") # remove specific data frames | |
ls() # view open data sets (should be empty if you just ran the code above) | |
detach("package:vegan", unload=TRUE) # unload packages that we don't need or that mask other functions | |
# sometimes we may want to unload different versions of packages - the function below handles this +
# provides a specific example of how MatchIt loads MASS which masks dplyr
# function to detach MASS package which masks the 'select' operation within dplyr used for merging data | |
# we use dplyr extensively to process data but later in the analysis using the MASS package to estimate | |
# negative binomial models (for utilization counts like ER visits) it 'masks' or prevents 'select' from working | |
# this is a problem if we were previously using MASS (or Matchit which leverages MASS) in an analysis and then wanted to run this program | |
# so we want to 'unload' or 'detach' any versions of MASS which may be loaded before trying to build the data | |
# for the analysis below leveraging dplyr functions like 'select.' This is unnecessary if MASS has not been loaded | |
# see also: https://stackoverflow.com/questions/6979917/how-to-unload-a-package-without-restarting-r | |
# Detach (and unload) every attached copy of a package from the search path.
# `pkg` may be passed as a bare symbol (default) or as a string when
# character.only = TRUE. The while-loop handles the case where several
# versions of the same package are attached at once.
detach_package <- function(pkg, character.only = FALSE) {
  if (!character.only) {
    # Capture the unevaluated symbol and convert it to its name as a string.
    pkg <- deparse(substitute(pkg))
  }
  target <- paste0("package:", pkg)
  while (target %in% search()) {
    detach(target, unload = TRUE, character.only = TRUE)
  }
}
detach_package("MatchIt", TRUE) # first have to unload 'MatchIt' because it imports 'MASS' | |
detach_package("MASS", TRUE) # unload/detach MASS | |
# view specific columns | |
View(df1[,100:103]) | |
str(df1) # variable names and formats | |
str(df1, list.len=ncol(df1)) # in case R truncates the output above | |
# set working directory | |
setwd('/Users/wkuuser/Desktop/R Data Sets') # mac | |
setwd("P:\\R Code References\\R Training") # windows | |
# *------------------------------------------------------------------* | |
# | make R talk on a macbook | |
# *------------------------------------------------------------------* | |
b <- sprintf("say Pumpkin pie is the best") | |
system(b, intern = FALSE, ignore.stdout = FALSE, ignore.stderr = | |
FALSE, wait = TRUE, input = NULL) | |
# *--------------------------------------------------------------------------------------------------* | |
# | direct reading of a single variable vector from command line- similar to cards statement in SAS | |
# *------------------------------------------------------------------------------------------------* | |
GARST <- c(150,140,145,137,141,145,149,153,157,161) # read in specified values for the variable GARST | |
print(GARST) | |
GARST <- c(150,140,145,137,141,145,149,153,157,161) | |
PIO <- c(160,150,146,138,142,146,150,154,158,162) | |
MYC <- c(137,148,151,139,143,120,115,136,130,129) | |
DEK <- c(150,149,145,140,144,148,152,156,160,164) | |
PLOT <- c(1,2,3,4,5,6,7,8,9,10) | |
BT <- c('Y','Y','N','N','N','N','Y','N','Y','Y') | |
RR <- c('Y','N','Y','N','N','N','N','Y','Y','N') | |
yield_data <- data.frame(GARST,PIO,MYC,DEK,PLOT,BT,RR) | |
rm(GARST,PIO,MYC,DEK,PLOT,BT,RR) # cleanup | |
# scatter plot | |
plot(yield_data$GARST,yield_data$PIO) | |
# this is even more like the cards statement in SAS | |
toread <- "id sex age inc r1 r2 r3 | |
1 F 35 17 7 2 2 | |
17 M 50 14 5 5 3 | |
33 F 45 6 7 2 7 | |
49 M 24 14 7 5 7 | |
65 F 52 9 4 7 7 | |
81 M 44 11 7 7 7 | |
2 F 34 17 6 5 3 | |
18 M 40 14 7 5 2 | |
34 F 47 6 6 5 6 | |
50 M 35 17 5 7 5" | |
survey <- read.table(textConnection(toread), header = TRUE) | |
closeAllConnections() | |
#-------------------------------------------------- | |
# reading a file from the web | |
#--------------------------------------------------- | |
library(data.table) | |
mydat <- fread('https://raw.githubusercontent.com/dincerti/dincerti.github.io/master/data/mepsdiab.csv') | |
head(mydat) | |
# *------------------------------------------------------------------* | |
# | reading data from a txt file | |
# *------------------------------------------------------------------* | |
# data looks like this: | |
# GARST PIO MYC DEK PLOT BT RR | |
# 150 160 137 150 1 Y Y | |
# 140 150 148 149 2 Y N | |
# 145 146 151 145 3 N Y | |
# 137 138 139 140 4 N N | |
# 141 142 143 144 5 N N | |
# 145 146 120 148 6 N N | |
# 149 150 115 152 7 Y N | |
# 153 154 136 156 8 N ? | |
# 157 158 130 160 9 Y Y | |
# 161 162 129 164 10 Y N | |
yield_data<-read.table("Yield_plots.txt", header=TRUE) # read text file | |
names(yield_data) # var list | |
print(yield_data) # print data set | |
yield_data # note: print() is not necessary to print | |
# get all factor variables | |
is.fact <- sapply(yield_data, is.factor) | |
tmp <- yield_data[,is.fact] | |
# *------------------------------------------------------------------* | |
# | reading data from a csv file | |
# *------------------------------------------------------------------* | |
# data looks like this | |
# HYBRID TRAIT P1 P2 P3 | |
# GARST RR 150 140 145 | |
# MYC BT 160 150 146 | |
# PIO RR 137 148 151 | |
# DEK BT 150 149 145 | |
plots <- read.csv("plots.csv", na.strings=c(".", "NA", "", "?"), encoding="UTF-8") | |
plots | |
# create function to reformat column names from csv files (from:https://stackoverflow.com/questions/17152483/how-to-replace-the-in-column-names-generated-by-read-csv-with-a-single-spa ) | |
# Reformat column names produced by read.csv(): read.csv replaces illegal
# characters (spaces, punctuation) with dots; this collapses each run of dots
# into a single underscore, then strips any leftover trailing separators.
# (adapted from: https://stackoverflow.com/questions/17152483/how-to-replace-the-in-column-names-generated-by-read-csv-with-a-single-spa )
makeColNamesUserFriendly <- function(ds) {
  # Collapse any number of consecutive dots into a single underscore.
  names(ds) <- gsub(x = names(ds),
                    pattern = "(\\.)+",
                    replacement = "_")
  # Drop trailing separators. (BUG FIX: the original stripped trailing
  # *spaces*, which can never exist after the substitution above —
  # trailing "_" is what is actually left behind, e.g. "Sales." -> "Sales_".)
  names(ds) <- gsub(x = names(ds),
                    pattern = "_+$",
                    replacement = "")
  ds
}
# create unique ID for each line in data frame | |
tmp_claims$RowID <- 1:nrow(tmp_claims) | |
# convert index to column | |
names <- rownames(yield_data) | |
yield_data <- cbind(names,yield_data) | |
# repair read error for first ID (sometimes reading a csv file causes the first element in the file to be misread)
df$ID <- as.character(df$ID) | |
df[1,1] <- '123456' | |
# *------------------------------------------------------------------*
# | reading data from an excel xlsx file (openxlsx)
# *------------------------------------------------------------------*
xlsxFile <- "C:\\Users\\mtb2901\\Documents\\Data Science Projects\\Projects 2018\\201801 Hospital Risk Profile\\Raw Data\\TX-Hospital-Tricare-Network.xlsx"
# reference: https://www.rdocumentation.org/packages/openxlsx/versions/4.0.17/topics/read.xlsx
# read.xlsx(xlsxFile, sheet = 1, startRow = 1, colNames = TRUE,
#           rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
#           skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
#           namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)
# BUG FIX: openxlsx::read.xlsx takes 'sheet', not 'sheetIndex' (that argument
# belongs to the older 'xlsx' package; the signature documented above is openxlsx)
vars <- read.xlsx("FraudT3CB14jun18HEADER.xlsx", sheet = 1) # read header file with variable names
#-------------------------------------
# writing files
#--------------------------------------
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)
# export as R data file
save(MBR_FACT_ANLY, file = "//projects//Data//MBR_FACT_ANLY.RData")
# export as text with tab delimiter
# (BUG FIX: the file path was missing its opening quote, a syntax error)
write.table(df1, "//Projects//Data//df1.txt", row.names = FALSE, sep = "\t")
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | printing and subsetting data | |
# | | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# print selected variables | |
yield_data[c("GARST")] | |
# subset data via variable selection | |
my_hybrids <- yield_data[ c("GARST", "PIO")] | |
print(my_hybrids) | |
# subset based on a single variable | |
GARST<-yield_data[c("GARST")] | |
print(GARST) | |
df <- df[ , !names(df) %in% c("xvar")] # drop field | |
# print a subset of data based on observed variable values | |
print(yield_data[yield_data$GARST==150 & yield_data$PIO==160,]) | |
# subset data based on observed values | |
high_yields <- yield_data [ yield_data$GARST==150 & yield_data$PIO==160,] | |
print(high_yields) | |
stacked_traits <-yield_data[ yield_data$BT =="Y" & yield_data$RR =="Y",] | |
stacked_traits | |
# subset based on just one variable value
# (BUG FIX: the original indexed with a bare `GARST`, relying on the global
# vector that was removed with rm() after yield_data was built — reference
# the data frame column instead)
LOW_GARST <- yield_data[yield_data$GARST == 140, ]
print(LOW_GARST)
LOW_GARST <- yield_data[yield_data$GARST < 150, ]
print(LOW_GARST)
RR<- yield_data[yield_data$RR =="Y",] | |
RR | |
Bt<- yield_data[yield_data$BT=="Y",] | |
Bt | |
# subset based on %in% | |
vc <- c(150,145,195) | |
tmp <- yield_data[yield_data$GARST %in% vc,] | |
# or | |
tmp <- yield_data[yield_data$GARST %in% c(150,145,195,161),] | |
# this might also works using dplyr (which allows for additional manipulation if you want to | |
# 'pipe' in additional aggregations or calculations) | |
library(dplyr) | |
tmp <- yield_data %>% | |
filter(GARST %in% c(150,145,195)) | |
# subset based on 'not' %in% using dplyr | |
tmp <- filter(yield_data,!BT %in% c('Y')) | |
library(sqldf) | |
tmp <- sqldf('select * from yield_data where GARST in(150,145,195)') | |
#------------------------------------------------ | |
# connect to SQL server | |
#------------------------------------------------ | |
# read initial analysis file | |
library(RODBC) | |
# connect to the SQL Server LOUSQLWTS853
DB_Connection <- odbcDriverConnect('driver={SQL Server};server=arlsql123;database=ANLY_SBOX;trusted_connection=true') | |
# connect to specific table (see documentation to limit rows etc.) | |
MBR_FACT_ANLY <- sqlFetch(DB_Connection, "mbr_fact_anly") | |
# export as R data file for future use(so we don't have to hit SQL every time) | |
save(MBR_FACT_ANLY, file = "//projects//mtt94674//Data//MBR_FACT_ANLY.RData") | |
# make sure to close your database connection | |
close(DB_Connection) | |
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | summary statistics | |
# | | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
summary(yield_data) # whole data set | |
summary(yield_data[c("GARST")]) # single variable | |
summary(yield_data$GARST) # single variable (more efficient) | |
summary(yield_data[c("GARST" , "PIO")]) # selected variables | |
summary(sqrt(yield_data$GARST) ) # transformation | |
# by processing | |
by(yield_data, yield_data$BT, summary) | |
# using dplyr | |
yield_data%>% | |
group_by(BT) %>% # group by s | |
summarize(mean = mean(GARST)) # mean | |
### more examples with dplyr | |
# how many members per risk group (members are duplicated by transactions) | |
# as such the same member can have more than one transaction scored by the model | |
# that are in the same risk group and a member can belong to multiple | |
# risk groups if they have multiple transactions scored accordingly | |
### brute force example | |
tmp <- df4%>%filter(riskcat == '1:LOW')%>%distinct(mbrid) | |
tmp <- df4%>%filter(riskcat == '2:MOD')%>%distinct(mbrid) | |
tmp <- df4%>%filter(riskcat == '3:HI')%>%distinct(mbrid) | |
tmp <- df4%>%filter(riskcat == '4:VHI')%>%distinct(mbrid) | |
### a little less brute force | |
tmp1 <- df4%>%select(riskcat,mbrid)%>%arrange(riskcat,mbrid) # sort by risk and ID | |
tmp2 <-tmp1%>%group_by(riskcat)%>%distinct(mbrid) # get unique IDs per risk strata | |
# how many members per strata | |
tmp2%>% | |
group_by(riskcat)%>% | |
summarize(total = n()) | |
### can we do the things above in one step | |
df4%>%select(riskcat,mbrid)%>%arrange(riskcat,mbrid)%>%group_by(riskcat)%>%distinct(mbrid)%>% | |
summarize(total = n()) | |
library(Hmisc) | |
# frequencies | |
library(Hmisc) | |
describe(yield_data) | |
# contents of data set - (Hmisc) | |
contents(yield_data) | |
# aggregations and crosstabulations | |
summary(ACTIV_PARTICIP ~ AGECAT, data=hc_mbr_anly, fun=mean) | |
# tables | |
tmp <- data.frame(table(complications$Measure_ID)) # saved as an easily readable data frame | |
aggregate(ACTIV_PARTICIP~RUCC_2013_DESC,hc_mbr_anly,mean) | |
CROP <- c('Cotton','Cotton','Corn','Corn','Corn','SB','SB','SB','SB','SB') | |
RR <- c('Y','N','Y','N','N','N','N','Y','Y','N') | |
yield_data <- data.frame(CROP,RR) | |
rm(CROP,RR) # cleanup | |
mytable <-table(yield_data$RR,yield_data$CROP) | |
tmp <- data.frame(mytable) | |
names(tmp) <- c("RR","Crop","Total") | |
print(tmp) | |
tmp <- data.frame(prop.table(mytable)) # cell percentages | |
names(tmp) <- c("RR","Crop","Pct") | |
print(tmp) | |
tmp <- data.frame(prop.table(mytable, 1)) # row percentages (%Crop within RR status) | |
names(tmp) <- c("RR","Crop","RowPct") | |
print(tmp) | |
tmp <- data.frame(prop.table(mytable, 2)) # column percentages (%RR within Crop) | |
names(tmp) <- c("RR","Crop","ColPct") | |
print(tmp) | |
# cross tabulations across a range of variables | |
# average garst across BT and RR | |
is.fact <- sapply(yield_data, is.factor) | |
tmp <- yield_data[,is.fact] | |
vars <- names(tmp) | |
for(i in 1:2) { | |
print("+-+-+-+-+-+-+-+-+-+-+-+-") | |
print(vars[i]) | |
print("+-+-+-+-+-+-+-+-+-+-+-+-") | |
print(by(yield_data$GARST, yield_data[c(vars[i])],mean)) | |
} | |
# cross tablulations across a range of columns in a data frame | |
names(sb) # list columns for reference | |
xvar <- as.list(names(sb)) # save names as a list to reference in loop | |
# loop across fields and produce frequencies | |
for(i in 1:36) { | |
print(xvar[i]) | |
print(data.frame(table(sb[c(paste(xvar[i]))]))) | |
} | |
aggregate(totmthcov~pre_term,df1,mean) | |
# using dplyr | |
df1%>% | |
summarize(mean = mean(prob,na.rm = TRUE), | |
min = min(prob,na.rm = TRUE), | |
Q1=quantile (prob, probs=0.25,na.rm = TRUE), | |
med = median(prob,na.rm = TRUE), | |
Q3=quantile(prob, probs=0.75,na.rm = TRUE), | |
P85=quantile(prob, probs=0.85,na.rm = TRUE), | |
P90=quantile(prob, probs=0.90,na.rm = TRUE), | |
P95=quantile(prob, probs=0.95,na.rm = TRUE), | |
P99=quantile(prob, probs=0.99,na.rm = TRUE)) | |
# aggregate totals per person | |
df5 <- df4%>% | |
group_by(ID) %>% | |
summarize(TOTAL_ALLOW_PRE = sum(TOT_ALLOWED_AMT)) | |
xtabs(~ BT, data = yield_data) # counts of types of BT | |
xtabs(~ BT + RR, data = yield_data) # BT by RR | |
xtabs(GARST ~ RR, data=yield_data) # total RR for GARST | |
# categorical frequencies by treatment status | |
mytable <- table(df1$PrimaryDiagDesc,df1$treat) | |
prop.table(mytable, 2) # column percentage | |
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | creating and adding new variables | |
# | | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# create a new data set that includes a new variable that is | |
# the sum of "garst" and "pio" | |
yield_data2<-transform(yield_data, sum=GARST+PIO) | |
print (yield_data2) | |
# add a new variable to existing data set that is | |
# the sum of all hybrids | |
#first create data set HYBRIDS | |
hybrids<-yield_data | |
print(hybrids) | |
# add new variable to hybrids | |
hybrids<-transform(hybrids, SUM=GARST+PIO+MYC+DEK) | |
print(hybrids) | |
# or add 2 new variables | |
hybrids<-transform(hybrids, SUM=GARST+PIO+MYC+DEK, | |
MEAN=SUM/4) | |
print(hybrids) | |
# combine or concatenate two columns into one character column
yield_data$x <- paste(yield_data$RR, yield_data$BT)
yield_data$x <- paste(yield_data$RR, "-", yield_data$BT)
# (BUG FIX: the explanatory phrase "for inserting a separator" was fused onto
# the end of the statement below, making it a syntax error — it is now a comment)
df$x <- paste(df$n, "-", df$s)  # paste with "-" for inserting a separator
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | conditional processing | |
# | | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# recall from the data set there are 10 plots | |
# first assume that in plot 3 we accidentally planted a Bt | |
# garst hybrid that was supposed to be a non-Bt hybrid | |
# so we don't want to include it in the non Bt | |
# average for hybrids- we can use 'ifelse' conditions | |
# to exclude this from the mean calculation | |
# first create data set trials | |
trials<-yield_data | |
print(trials) | |
print(trials$PLOT) | |
trials$AVG <- ifelse ( trials$PLOT == 3 , | |
(trials$AVG <- (trials$PIO+trials$MYC+trials$DEK)/3), # excludes GARST in the calculation if PLOT = 3 | |
(trials$AVG <- (trials$GARST+trials$PIO+trials$MYC+trials$DEK)/4 ) ) # for all other plots | |
# creates new variable AVG in data set trials | |
trials | |
# another example - notice the nesting of the ifelse conditions:
grades$letter <- ifelse(grades$score >= .90,"A", ifelse(grades$score >= .80 & grades$score < .90, "B",ifelse (grades$score >= .70 & grades$score < .80,"C",ifelse(grades$score >= .60 & | |
grades$score < .70, "D","F")))) | |
table(grades$letter) | |
# create binary target | |
banking$target <- ifelse(banking$y =="no",0,1) | |
# model calibration / risk stratification | |
df1$risk <- ifelse(df1$prob < .12,"1:LOW", | |
ifelse(df1$prob >= .12 & df1$prob < .30, "2:MOD", | |
ifelse(df1$prob >= .3 & df1$prob < .60, "3:HI","4:VHI"))) | |
table(df1$risk) | |
# age categorization | |
df1$AgeCat <- ifelse(df1$age < 18,'Under 18', | |
ifelse(df1$age >= 18 & df1$age <= 25,'18-25', | |
ifelse(df1$age > 25 & df1$age <= 29, '26-29', | |
ifelse(df1$age > 29 & df1$age <= 39,'30-39', | |
ifelse(df1$age > 39 & df1$age <= 49, '40-49', | |
ifelse(df1$age > 49 & df1$age <= 59, '50-59', | |
ifelse(df1$age >59 & df1$age <= 64, '60-64','65-65+'))))))) | |
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | stacking / concatenating/ adding data sets | |
# | | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# get only bt hybrids- same as subsetting | |
BT_HYBRIDS <- yield_data[yield_data$BT=="Y", ] | |
print(BT_HYBRIDS) | |
# get only non bt hybrids | |
NON_BT_HYBRIDS<- yield_data[yield_data$BT=="N",] | |
print(NON_BT_HYBRIDS) | |
# *------------------------------------------------------------------* | |
# | combine with the rbind function | |
# *------------------------------------------------------------------* | |
both<-rbind(BT_HYBRIDS,NON_BT_HYBRIDS) | |
print(both) | |
# *------------------------------------------------------------------* | |
# | stack with SQL using sqldf | |
# *------------------------------------------------------------------* | |
library(sqldf) | |
library(ggplot2) | |
both_sql = sqldf(' | |
select * | |
from BT_HYBRIDS union all select * from NON_BT_HYBRIDS | |
order by PLOT | |
') | |
# note: the 'all' statement keeps all rows, if you omit 'all' you get only unique rows | |
both_sql | |
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | | |
# | merging based on common variables | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# first split the data, one set contains plot, bt, rr, garst, pio | |
# the other set has plot, rr, bt, myc, dek | |
# we'll work under the assumption that the first set is 'early' | |
# hybrids and the second set is 'late' hybrids | |
# as if we collected data on these separately and want to combine the | |
# data set to get one set, like we started with with yield_data | |
# create early / late planting data set | |
early<-yield_data[c("PLOT","GARST","PIO","BT","RR")] | |
early | |
late<-yield_data[c("PLOT","MYC","DEK","BT","RR")] | |
late | |
# *------------------------------------------------------------------* | |
# | merge by common variables- PLOT, BT, RR | |
# *------------------------------------------------------------------* | |
hybrids<-merge(early,late,by=c("PLOT","BT","RR")) | |
hybrids | |
# *------------------------------------------------------------------* | |
# | merging by a single common variable | |
# *------------------------------------------------------------------* | |
# the above merges back to the orignianl data set, but | |
# we had to be careful to merge by ALL of the common variables | |
# suppose we had the following data set strip_trial.txt | |
# HYBRID YIELD TRAIT | |
# GARST 150 BT | |
# MYC 140 RR | |
# PIO 160 RR | |
# DEK 145 BT | |
# lets read it in and split it up to form 2 data sets 'yields' | |
# and 'traits' with a SINGLE common variable 'HYBRID' to merge by | |
#'yields' | |
# HYBRID YIELD | |
# GARST 150 | |
# MYC 140 | |
# PIO 160 | |
# DEK 145 | |
# 'traits' | |
# HYBRID TRAIT | |
# GARST BT | |
# MYC RR | |
# PIO RR | |
# DEK BT | |
# read data 'yields' | |
yields <- read.table("yields.txt", header=TRUE) # read text file | |
yields | |
# read data 'traits' | |
traits <- read.table("traits.txt", header=TRUE) # read text file | |
traits | |
# now to demonstate merging data sets by a common variable | |
field_data<-merge(yields,traits,by=c("HYBRID")) | |
field_data | |
# *------------------------------------------------------------------* | |
# | merging with SQL using sqldf | |
# *------------------------------------------------------------------* | |
# library(sqldf) | |
# library(ggplot2) | |
# left join yields and traits on HYBRID | |
field_data_sql = sqldf(' | |
select a.HYBRID,a.YIELD,b.TRAIT | |
from yields a left join traits b | |
on a.HYBRID =b.HYBRID | |
order by a.YIELD') | |
field_data_sql | |
# left join early and late (from above) on PLOT- note for each plot RR and BT are the | |
# same regardless of the hybrid so specifying BT and RR for only one data set will suffice, | |
# but we need to join on PLOT this seems to give a much more straightforward solution to the | |
# merge above using the R merge statement | |
hybrids_sql = sqldf(' | |
select a.PLOT,a.GARST,a.PIO, | |
b.MYC, b.DEK, a.BT, a.RR | |
from early a left join late b | |
on a.PLOT =b.PLOT | |
order by a.PLOT') | |
hybrids_sql | |
#----------------------------------------------------- | |
# merging using dplyr | |
#---------------------------------------------------- | |
tmp3 <- tmp2%>% left_join(tmp1, by = "State") | |
# get all months between EFFMTH and ENDMTH in cohort history file | |
monthlist <- df1%>%left_join(select(tmp5,MBRID,Month,covmth), | |
by = "MBRID")%>%filter(Month >= EFFMTH & Month <= ENDMTH) | |
monthlist <- monthlist %>%arrange(MBRID,Month) | |
# more complicated join with dplyr
# merge claims data with cohort file keeping only claims between the coverage periods
# (BUG FIX: the claims key is renamed MBRID = PID in the select(), so the join
# key must be "MBRID" — the original joined by "DMBRID", which exists in neither
# table and would error)
df3 <- df2%>%select(MBRID,EFFMTH,ENDMTH,INDEX_MTH_DT,pre12,post12,claimflag)%>%
  left_join(select(raw_claims, MBRID = PID, Claim_Number, SVC_FROM_DT, TOT_ALLOWED_AMT),
            by = "MBRID")%>%
  filter(SVC_FROM_DT >= EFFMTH & SVC_FROM_DT <= ENDMTH)
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | sorting data | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# use file plots2 weed pressure | |
# LOCATION TRAITS P1 P2 P3 P4 | |
# 1 RR 1 1 5 1 | |
# 2 RR 2 1 4 1 | |
# 1 RR 2 2 4 3 | |
# 2 RR 3 1 NA 3 | |
# 1 BT 4 5 2 4 | |
# 2 BT 5 4 5 5 | |
# 1 BT 5 3 4 4 | |
# 2 BT 4 5 5 5 | |
plots2<- read.table("plots2.txt", header =TRUE) | |
plots2 | |
# sort data by location | |
plotsSorted<-plots2[order(plots2$LOCATION),] | |
plots2 | |
plotsSorted | |
# sort by trait then location | |
plots2Sorted<-plots2[order(plots2$TRAITS, plots2$LOCATION),] | |
plots2Sorted | |
# for descending order, prefix any variable with a minus sign | |
plots2Sorted<-plots2[order(-plots2$LOCATION,plots2$TRAITS),] | |
plots2Sorted | |
# dplyr | |
tmp <- arrange(tmp,desc(Freq)) | |
# *-----------------------------------------------------------------* | |
# | | |
# | | |
# | | |
# | graphics | |
# | | |
# | | |
# | | |
# *------------------------------------------------------------------* | |
# *------------------------------------------------------------------* | |
# | examples of r graphics capabilities | |
# *------------------------------------------------------------------* | |
demo(graphics) | |
demo(persp) | |
library(lattice) | |
demo(lattice) | |
# *------------------------------------------------------------------* | |
# | basic plots | |
# *------------------------------------------------------------------* | |
# histogram
hist(yield_data$GARST)
# bar plot
# (BUG FIX: rownames() of an atomic column such as yield_data$PLOT is always
# NULL, so the original never drew a legend — use the PLOT values as labels)
barplot(yield_data$GARST, main="Garst Yields", xlab ="Plots", col =c("darkblue","red"),
        legend = as.character(yield_data$PLOT))
# bar plot (example)
counts <- mytable$Freq[mytable$Freq > 8]
# (BUG FIX: R booleans are spelled FALSE/TRUE — `False` is an unresolved name)
barplot(counts, main="Conditions of Members",
        xlab="Conditions", col=c("darkblue","red"),
        legend = mytable$Var1[mytable$Freq > 8], beside=FALSE)
# using xtabs to create plots | |
plot(~xtabs(GARST ~ RR, data=yield_data)) # barplot | |
plot(xtabs(~ BT, data = yield_data)) # barplot | |
plot(xtabs(~ BT + RR, data = yield_data)) # mosaic plot | |
# scatter plot | |
plot(yield_data$GARST,yield_data$PIO) | |
# scatterplot matrix of whole data set | |
plot(yield_data) | |
# scatterplot matrix- data used in regression in analysis section | |
land_prices<- read.table("land_prices.txt", header =TRUE) | |
land_prices | |
plot(land_prices) | |
# brushed scatterplot - as in model visualization | |
ggplot(df1,aes(x=SUMRX_NONLTOT_QB_GE120MEDD,y=SUMRX_LTOT_QB_GE120MEDD, shape = risk,color = risk)) + | |
scale_color_manual(values=c("green","yellow", "red","red")) + geom_point() | |
# labels + jittering | |
set.seed(456) | |
ggplot(df1,aes(x=SUMRX_NONLTOT_QB_GE120MEDD,y=SUMRX_LTOT_QB_GE120MEDD, shape = risk,color = risk)) + | |
scale_color_manual(values=c("green","yellow", "red","red")) + geom_point() + geom_jitter(width = 2, height =2) + | |
xlab("Total Non-Qualified >= 120") + | |
ylab("Total Qualified >= 120") + | |
ggtitle("Observed Rx Counts Overlaid by Predicted Risk/Stratification") | |
# *------------------------------------------------------------------* | |
# | stacked bar chart | |
# *------------------------------------------------------------------* | |
# get data | |
bytrait<- read.table("biotech_yield.txt", header =TRUE) | |
bytrait | |
names(bytrait) | |
# summarize data for stacking by trait | |
counts <- table(bytrait$RR, bytrait$YIELD) # list stacked variable first | |
counts | |
barplot(counts, main ="Yield by Trait for Garst",xlab ="Yield Category", col =c("darkblue","red"),legend =rownames(counts)) | |
#-------------------------------------------- | |
# pie and bar charts in ggplot | |
#-------------------------------------------- | |
table(df$pre_term) # get values by group | |
tmp <- data.frame( | |
group = c("Full-Term", "Pre-Term"), | |
value = c(7999,6000) | |
) | |
# stacked bar | |
bp <- ggplot(tmp, aes(x="", y=value, fill=group))+ | |
geom_bar(width = 1, stat = "identity") | |
bp | |
# pie | |
pie <- bp + coord_polar("y", start=0) | |
pie | |
pie + scale_fill_manual(values=c("red","blue")) | |
# *------------------------------------------------------------------* | |
# | basic density plots | |
# *------------------------------------------------------------------* | |
# plot and save histogram data | |
garsths <- hist(yield_data$GARST, main = "Garst Yield Histogram") | |
print(garsths) | |
# compute density estimates using default gaussian & rule of thumb bandwidth | |
garstdens <- density(x=yield_data$GARST, bw="nrd0",kernel="gaussian",n=20) | |
print(garstdens) | |
# rescale density so it will plot on the same graph as GARST histogram | |
rs <- max(garsths$counts/max(garstdens$y)) # create scaling factor for plotting the density | |
lines(garstdens$x, garstdens$y*rs, col=2) # plot density over histogram using lines statment | |
# compare this to just plotting the non-rescaled density over the histogram | |
lines(density(x=yield_data$GARST, bw="nrd0", n=20),col=3) # green line barely shows up | |
# turn off/close graphics window | |
dev.off() | |
# *------------------------------------------------------------------* | |
# | overlapping density plots | |
# *------------------------------------------------------------------* | |
library(colorspace) # package for rainbow_hcl function | |
# Generate just the data for a histogram of GARST grouping by RR trait | |
ds <- rbind(data.frame(dat=yield_data[,][,"GARST"], grp="All"), | |
data.frame(dat=yield_data[,][yield_data$RR=="N","GARST"], grp="N"), | |
data.frame(dat=yield_data[,][yield_data$RR=="Y","GARST"], grp="Y")) | |
# histogram for all GARST data | |
hs <- hist(ds[ds$grp=="All",1], main="", xlab="GARST", col="grey90", ylim=c(0, 9.46395633979357), breaks="fd", border=TRUE) | |
# compute density, rescale, and plot for all GARST data | |
dens <- density(ds[ds$grp=="All",1], na.rm=TRUE) | |
rs <- max(hs$counts)/max(dens$y) | |
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[1]) | |
# compute density, rescale, and plot where RR = 'N' | |
dens <- density(ds[ds$grp=="N",1], na.rm=TRUE) | |
rs <- max(hs$counts)/max(dens$y) | |
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[2]) | |
# compute density, rescale, and plot where RR = 'Y' | |
dens <- density(ds[ds$grp=="Y",1], na.rm=TRUE) | |
rs <- max(hs$counts)/max(dens$y) | |
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[3]) | |
# Add a rug to illustrate density | |
rug(ds[ds$grp=="N", 1], col=rainbow_hcl(3)[2]) | |
rug(ds[ds$grp=="Y", 1], col=rainbow_hcl(3)[3]) | |
# Add a legend to the plot. | |
legend("topright", c("All", "N", "Y"), bty="n", fill=rainbow_hcl(3)) | |
# Add a title to the plot. | |
title(main="Distribution of GARST | |
by RR", sub=paste("Created Using R Statistical Package", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"])) | |
### pie chart
table(tmp$Enrollee) # get counts
# NOTE(review): the slice counts below are hard-coded; confirm they match the
# table() output above before reusing this snippet
slices <- c(3111,14141) # define slices
lbls <- c("True","False") # slice labels
pct <- round(slices/sum(slices)*100,digits =2) # calc pct
lbls <- paste(lbls, pct) # add calculated percents to labels
lbls <- paste(lbls,"%",sep="") # add % to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),
main="Members")
### aggregate time series and produce multiple line chart (dplyr & ggplot2)
# subset only top 5 departments for OT Hrs
tmp2<-
filter(df2,Role=="Specialist"|Role=="Finder"|Role=="Customer Care Specialist"|Role =="Clinical"|Role=="Content Specialist")
# aggregate by department and date (one total-OT row per Role x End_Date)
tmp3 <- tmp2%>%
group_by(Role,End_Date) %>%
summarize(TotalOTHrs = sum(Overtime_Hours))
# one line + point series per Role
ggplot(tmp3,aes(End_Date,TotalOTHrs,colour=Role)) + geom_line() + geom_point()
### aggregate and simple plot
# aggregate & visualize total OT by pay period
tmp <- df2%>%
group_by(End_Date) %>%
summarize(TotalOTHrs = sum(Overtime_Hours))
# type="b" draws both points and connecting lines
plot(tmp$End_Date,tmp$TotalOTHrs,type="b", xlab = "End Date", ylab = "Total OT Hrs")
#------------------------------------------------
# side by side plots
#------------------------------------------------
par(mfrow=c(1,2)) # set the plotting area into a 1*2 array
# NOTE(review): treat and ctrl are assumed numeric vectors defined upstream
hist(treat,breaks = 50)
hist(ctrl, breaks = 50)
# *-----------------------------------------------------------------*
# |
# |
# |
# | analysis
# |
# |
# |
# *------------------------------------------------------------------*
# *------------------------------------------------------------------*
# | t-tests
# *------------------------------------------------------------------*
# independent samples t-test
t.test(yield_data$GARST,yield_data$PIO ) # difference in GARST vs PIO yields from yield_data file
t.test(plots2$P4[plots2$TRAITS=='RR'],plots2$P4[plots2$TRAITS=='BT'] ) # difference in weed pressure for bt and rr traits in plot 4
# paired t-test
t.test(plots2$P1,plots2$P2,paired=TRUE) # differences in overall weed pressure for plots 1 and 2 (more appropriately used on same subjects- before and after tests)
# example if plot 1 was before herbicide application while plot 2 was the same plot measured for weed
# pressure after application
# (fix: the continuation line above was missing its leading '#', which made the
# script fail to parse at this point)
t.test(yield_data$GARST,yield_data$PIO,paired=TRUE) # not actually appropriate but notice the difference in the results from the previous t-test on
# GARST and PIO
# *------------------------------------------------------------------*
# | measures of association
# *------------------------------------------------------------------*
# pearson correlations - no sig tests
# (rcorr comes from the Hmisc package, loaded elsewhere in this script)
rcorr(cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4)) # gives pearson correlations
# the cor.test has p-value, ci, but only 2 vars at a time
cor.test(plots2$P1,plots2$P2,use="pairwise")
# spearman correlations using the hmisc rcorr function
rcorr( cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4), type='spearman')
rcorr( cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4), type='pearson') # compare, also same as default
# *------------------------------------------------------------------*
# | simple linear regression
# *------------------------------------------------------------------*
# lets use another data set land_prices
# PRICE ACRES IMPROVEMENTS
# 36 9 8
# 80 15 7
# 44 10 9
# 55 11 10
# 35 10 6
land_prices<- read.table("land_prices.txt", header =TRUE) # assumes file in working directory
land_prices
plot(land_prices) # scatterplot of dependent and independent variables
myRegModel<-lm(PRICE~ACRES,data=land_prices) # run regression
summary(myRegModel) # view estimates
anova(myRegModel) # ANOVA table for the fitted model
# simple plots- plot data and regression line
plot(land_prices$ACRES,land_prices$PRICE)
abline(myRegModel) # overlay fitted line on the scatterplot
# automated plots (diagnostics: residuals, QQ, leverage, etc.)
plot(myRegModel)
termplot(myRegModel)
# *------------------------------------------------------------------*
# | multiple linear regression
# *------------------------------------------------------------------*
myRegModel<-lm(PRICE~ACRES+IMPROVEMENTS,data=land_prices) # run regression with two predictors
summary(myRegModel) # view estimates
anova(myRegModel) # sequential (Type I) ANOVA table
plot(myRegModel) # diagnostic plots
termplot(myRegModel) # partial effect plot per term
# *------------------------------------------------------------------*
# | analysis of variance
# *------------------------------------------------------------------*
# look at data set 'yield_data'
yield_data<-read.table("Yield_plots.txt", header=TRUE) # read text file
print(yield_data) # print data set
# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N
# we would like to set up a design such as this:
# let treatments = hybrid, blocks = plot
#------------------------------------------------#
# HYBRID
# GARST PIO MYC DEK
# PLOT 1 150 137 160 150
# PLOT 2 140 148 150 149
# PLOT 3 145 151 146 145
#------------------------------------------------#
# actual data set will need to look like this 'yield_trials'
# (long format: one row per HYBRID x PLOT observation)
# HYBRID PLOT YIELD
# GARST P1 150
# GARST P2 140
# GARST P3 145
# PIO P1 137
# PIO P2 148
# PIO P3 151
# MYC P1 160
# MYC P2 150
# MYC P3 146
# DEK P1 150
# DEK P2 149
# DEK P3 145
yield_trials<-read.table("yield_trials.txt", header=TRUE) # read text file
print(yield_trials) # print data set
# *------------------------------------------------------------------*
# | 2 WAY AOV
# *------------------------------------------------------------------*
# randomized complete block layout: HYBRID = treatment, PLOT = block
myModel<-aov(YIELD~HYBRID+PLOT,data=yield_trials)
summary(myModel)
anova(myModel) # should give same results as above
plot(myModel) # diagnostic plots
termplot(myModel)
#--------------------------------------------------
# getting marginal/adjusted/least squares means from multivariable models
#--------------------------------------------------
### example linear regression
summary(m1 <- lm(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping + SourceFlag + Gender_Code + LowIncome + RuralZips + Pre_ER_Admit + AcutePre + Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma + Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression + HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag + Total_Bene + Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals +BranchDesc + SponsorRankDesc + AgeCat, data = df_chrt))
# b_trt = 1.03 more ER visits in the treatment group compared to .64 fewer in the matched analysis
# use emmeans package to calculate 'marginal means' or adjusted means
library(emmeans)
emmeans (m1, ~ Control_Study_Flag) # 4.81-3.77 = 1.04 which is ~ what we get directly from the regression above
### example negative binomial model
# (glm.nb comes from the MASS package; maxit raised because these models can be slow to converge)
summary(m1 <- glm.nb(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping + SourceFlag + Gender_Code + LowIncome + RuralZips +Pre_ER_Admit + AcutePre + Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma + Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression +HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag +Total_Bene + Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals + BranchDesc + SponsorRankDesc + AgeCat, control = glm.control(maxit = 500),data = df_chrt))
exp(0.184) # IRR = 1.202 treatment group has ~ 20% more ER visits than controls
m1.mmeans <- emmeans(m1, ~ Control_Study_Flag) # estimate marginal means
summary(m1.mmeans, infer = TRUE, type = 'response') # summary with backtransformation to reflect original scale
(.0634/.0527) # this gives a ratio of marginal means = 1.203 which is consistent with the exponentiated result directly from the model
# *-----------------------------------------------------------------*
# |
# |
# |
# | labels and formats
# |
# |
# |
# *------------------------------------------------------------------*
#-----------------------------------------------------------------#
# value labels or formats
#-----------------------------------------------------------------#
##
## example 1: recoding
##
# change location from 'numeric' into a 'factor' with character values
# this is not actually creating labels, but actually changing the numeric values
# of LOCATION to more meaningful character values such as "N" for north and "S" for south
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
plots2
##
## example 2: creating new variables to act as labels for values
##
# lets create new variables to act as labels for insect pressure levels 1-5
# instead of changing the numeric values for P1-P4, we are creating a new
# set of character variables to describe P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
# now create a new set of variables that describe P1-P4
plots2$P1f <- factor(plots2$P1, myPlevels, myPlabels)
plots2$P2f <- factor(plots2$P2, myPlevels, myPlabels)
plots2$P3f <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4f <- factor(plots2$P4, myPlevels, myPlabels)
plots2
# Get summary and see that the factor levels are now counted.
summary( plots2[ c("P1f","P2f","P3f","P4f") ] )
##
## note we could have just renamed these values just as we did for
## location in the first example
##
plots2b <- read.table("plots2.txt", header =TRUE) # copy of plots2 for this example
plots2b
# note we are re-using pre-defined labels from above so those statements
# are not repeated here
plots2b$P1 <- factor(plots2b$P1, myPlevels, myPlabels)
plots2b$P2 <- factor(plots2b$P2, myPlevels, myPlabels)
plots2b$P3 <- factor(plots2b$P3, myPlevels, myPlabels)
plots2b$P4 <- factor(plots2b$P4, myPlevels, myPlabels)
plots2b
#-----------------------------------------------------------#
# variable labels
#-----------------------------------------------------------#
# note in the examples above we were concerned with the formatting
# the values a particular variable could take, such as values N, S
# for the variable LOCATION or the value "Very Heavy" for the variables P1-P4
# here we are looking at changing the name or appearance of the variables themselves
# vs the values they may take
library(Hmisc)
# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# assign variable names to act as labels - lets assume we
# want variables p1-p4 to appear as plot1 - plot4
# this example 'pastes' new names over the old ones or
# simply changes the variable names
# note the reference [3:6] simply identifies the 3rd-6th variable names
# in the data frame for processing.
names(plots2)[3:6] <- c(
"PLOT 1",
"PLOT 2",
"PLOT 3",
"PLOT 4")
names(plots2) # verify variable names have been changed
#-------------------------------------------------------#
# complete example for variable name and value labels
#-------------------------------------------------------#
# utilize all of the methods above to re-format a data set
# suppose again we start off with a data set such as plots2
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5
# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# suppose we want the the values for the variable LOCATION to be N for LOCATON =1 and
# S for LOCATION= 2. We also want the values for the variables P1-P4 to be
# 1 = "Very Heavy" 2 ="Heavy" 3 = "Moderate" 4 = "Light" 5 ="Very Light"
# and we want the variable names P1-P4 to be "PLOT 1" - "PLOT 4"
# STEP 1: change values for variable LOCATION
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
# STEP 2: change values for the variables P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
plots2$P1 <- factor(plots2$P1, myPlevels, myPlabels) # note we are actually changing the values for P1-P4
plots2$P2 <- factor(plots2$P2, myPlevels, myPlabels) # vs creating variables to describe the values
plots2$P3 <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4 <- factor(plots2$P4, myPlevels, myPlabels)
# STEP 3: change the VARIABLE labels for P1-P4
names(plots2)[3:6] <- c(
"PLOT 1",
"PLOT 2",
"PLOT 3",
"PLOT 4")
# print re-formatted data set
plots2
# functions
# myvar.port: standard deviation of a two-asset portfolio.
# Relies on Wa, Wb (weights), std.a, std.b (standard deviations) and p.ab
# (correlation) being defined in the calling environment; assumed scalars or
# equal-length vectors (the vectorized arithmetic handles both -- TODO confirm
# intended shapes with the original author).
# nWeights is retained only for backward compatibility with existing callers:
# the original loop never indexed by its loop variable and overwrote the
# preallocated matrix on every iteration, so the loop and the matrix were
# dead code and have been removed.
myvar.port <- function(nWeights) {
  sqrt(Wa^2 * std.a^2 + Wb^2 * std.b^2 + 2 * p.ab * Wa * Wb * std.a * std.b)
}
# example: produce summary statistics for each category in a field
cats <- as.vector(tmp$Var1) # extract measure ids as a vector
# iterate over every category; seq_along() replaces the hard-coded 1:19 so the
# loop adapts to however many categories exist (and safely does nothing when empty)
for(i in seq_along(cats)) {
  print(cats[i])
  print(summary(complications$Score[complications$Measure_ID ==cats[i]]))
}
#--------------------------------------------------
# dealing with scientific notation
#--------------------------------------------------
x <- 1.82*(10^-7)
y <- format(x, scientific = FALSE) # format one value as a plain-decimal string
# or even better:
# (global side effect: biases all subsequent printing away from scientific notation)
options(scipen = 999)
#----------------------------------------------------
# PROPENSITY SCORE MATCHING
#---------------------------------------------------
######################################################
# example from MatchIt documentation
#####################################################
# requires library(MatchIt); the lalonde data set ships with that package
# example data set is a subset of the job training program analyzed in Lalonde (1986) and Dehejia and Wahba (1999).
# MatchIt includes a subsample of the original data consisting of the National Supported Work Demonstration (NSW)
# treated group and the comparison sample from the Population Survey of Income Dynamics (PSID).1 The variables in this data
# set include participation in the job training program (treat, which is equal to 1 if participated in the program, and 0 otherwise),
# age (age), years of education (educ), race (black which is equal to 1 if black, and 0 otherwise;
# hispan which is equal to 1 if hispanic, and 0 otherwise), marital status (married, which is equal to 1 if married, 0 otherwise),
# high school degree (nodegree, which is equal to 1 if no degree, 0 otherwise), 1974 real earnings (re74), 1975 real earnings
# (re75), and the main outcome variable, 1978 real earnings (re78)
head(lalonde)
dim(lalonde)
names(lalonde)
# "treat" "age" "educ" "black" "hispan" "married" "nodegree" "re74" "re75" "re78"
####################################################
# nearest neighbor matching
####################################################
# Matching is done using a distance measure specified by the distance option (default=logit).
# Matches are chosen for each treated unit one at a time, with the order specified by the m.order command (default=largest to smallest).
# At each matching step we choose the control unit that is not yet matched but is closest to the treated unit on the distance measure.
m.out1 <- matchit(treat ~ re74 + re75 + age + educ, data = lalonde, method = "nearest", distance = "logit")
summary(m.out1) # check balance
m.data1 <- match.data(m.out1,distance ="pscore") # create ps matched data set
head(m.data1) # view
# perform paired t-tests (not in documentation)
t.test(m.data1$re78[m.data1$treat==1],m.data1$re78[m.data1$treat==0],paired=TRUE)
t.test(lalonde$re78[lalonde$treat==1],lalonde$re78[lalonde$treat==0],paired = FALSE)
# export data to compare in SAS or perform additional analysis
# NOTE(review): the //...// path separators below look machine-specific; verify before reuse
write.csv(lalonde, file = "//Documents//Briefcase///Data//lalonde.csv")
# ex
write.csv(w1, file = "r goals.csv") # assumes assigned directory or default directory
write.csv(m.data1, file = "//Documents//Briefcase///Data//lalonde_nearest.csv")
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)
#---------------------------------
# working with dates
#--------------------------------
# calculate difference in dates
# ex 1: subtracting two Date objects yields a difftime (in days)
survey$date_diff <- as.Date(as.character(survey$date), format="%Y/%m/%d")-as.Date(as.character(survey$tx_start), format="%Y/%m/%d")
survey
# ex 2
df$diff_in_days<- difftime(df$datevar1 ,df$datevar2 , units = c("days"))
# application
temp3_dates$diff_days <- abs((as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y")-as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y")))
# or
temp3_dates$diff_days <- abs((difftime(as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y"),as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y") , units = c("days"))))
# create date format
df1$CovEffDate <- as.Date(df1$Coverage_Effective_Date, "%Y-%m-%d")
# sometimes you need to supply the origin if your data is numeric
# (as_datetime and lubridate::origin require the lubridate package)
as_datetime(y, origin = lubridate::origin)
# I have also seen
as.Date(17536,origin="1970-01-01") # days since the Unix epoch
# get min and max dates
tmp <- df1%>%
summarize(mindate = min(CovEffDate, na.rm=TRUE),
maxdate = max(CovEffDate, na.rm=TRUE))
## first days of years
seq(as.Date("1910/1/1"), as.Date("1999/1/1"), "years")
## by month
seq(as.Date("2000/1/1"), by = "month", length.out = 12)
## quarters
seq(as.Date("2000/1/1"), as.Date("2003/1/1"), by = "quarter")
## find all 7th of the month between two dates, the last being a 7th.
st <- as.Date("1998-12-17")
en <- as.Date("2000-1-7")
ll <- seq(en, st, by = "-1 month") # step backwards from the end date
rev(ll[ll > st & ll < en])
# application: calculate age at index date for an observational study
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- as.Date(df1$BirthDate, "%Y-%m-%d") # format as date
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age in years (365.25 accounts for leap years)
df1$age <- as.numeric(df1$age) # make sure this is numeric
# calculate age using lubridate function
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- mdy(df1$BirthDate) # apply date format (lubridate function)
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age
df1$age <- as.numeric(df1$age) # make numeric
#----------------------------
# sample training and validation
#---------------------------
# store total number of observations in your data
# NOTE(review): N is hard-coded; it must equal nrow(apps) for the split below
# to cover the whole data set -- consider N <- nrow(apps)
N <- 400
print(N)
# Number of training observations (50/50 split)
Ntrain <- N * 0.5
print(Ntrain)
# add an explicit row number variable for tracking
id <- seq(1,400)
apps2 <- cbind(apps,id)
# Randomly arrange the data and divide it into a training
# and test set.
dat <- apps2[sample(1:N),] # permute the rows
train <- dat[1:Ntrain,]
validate <- dat[(Ntrain+1):N,]
dim(dat)
dim(train)
dim(validate)
# sort and look at data sets to see that they are different
sort_train <- train[order(train$id),]
print(sort_train)
sort_val <- validate[order(validate$id),]
print(sort_val)
# random sample
# Draw n rows from the data frame df, without replacement, in random order.
randomSample <- function(df, n) {
  picked <- sample(nrow(df), n)
  df[picked, ]
}
rs <- randomSample(tmp,5)
# random sample (with set.seed for repeatability)
# NOTE(review): seeding inside the function resets the global RNG state on
# every call -- that is why the two calls below return identical rows, but it
# is a side effect on callers; acceptable here as a demonstration
randomSample = function(df,n) {
set.seed(123)
return (df[sample(nrow(df), n),])
}
randomSample(tmp,5) # first call
randomSample(tmp,5) # second call (same rows, because the seed was reset)
#-----------------------------------
# bootstrap
#-----------------------------------
# regression
summary(fit <- lm(GARST ~ PIO, data = yield_data))
# boot strap for regression (this may not be accurate with such small sample)
# preallocate the result vector rather than growing it with c() inside the
# loop -- growing copies the whole vector on every iteration (O(n^2))
B <- 10000 # number of bootstrap replicates
bstrap <- numeric(B)
for (i in seq_len(B)) {
  # resample 5 rows with replacement and refit the regression
  newsample <- yield_data[sample(nrow(yield_data), 5, replace = TRUE), ]
  bstrap[i] <- coef(lm(GARST ~ PIO, newsample))[2] # keep the slope estimate
}
hist(bstrap)
summary(bstrap)
sd(bstrap, na.rm = TRUE) # SE
quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% percentile interval
#------------------------------------
# transpose or reshape data
#------------------------------------
# reference: https://stats.idre.ucla.edu/r/faq/how-can-i-reshape-my-data-in-r/
# create example data (long/panel format: one row per id x measure)
id <- c(1,1,1,2,2,2,3,3,3)
measure <- c("depth","temp","width","depth","temp","width","depth","temp","width")
values <- c(2,50,18,1.5,53,18,2.5,60,18)
dat <- data.frame(id,measure,values)
# transpose panel data to wide format
datwide <- reshape(dat,
timevar = "measure", # the panel variable(s) we want to be new columns
idvar = c("id"), # vars we want to keep constant
direction = "wide")
# collapse wide data into panel
datnarrow <- reshape(datwide,
varying = c("depth","temp","width"), # things we want to collapse into a single column
v.names = "value", # name for new column that will hold panel of values
timevar = "measure", # name for new column that collapses old variable names into values
times = c("depth","temp","width"), # old variable names that become new values
new.row.names = 1:1000,
direction = "long")
# cleanup names; note the "." in the pattern is a regex wildcard here --
# fixed = TRUE would match the literal prefix "values." more strictly
names(datwide) <- gsub("values.", "", names(datwide )) # cleanup names
#----------------------------
# check duplicates
#---------------------------
# check duplication: table() counts rows per ID, so Freq > 1 means duplicated
tmp <- data.frame(table(temp1_stars$Provider_ID))
summary(tmp)
# remove duplicates or get distinct
DEK <- c(150,150,145,140,144,148,152,156,160,164)
PLOT <- c(1,1,1,1,5,6,7,8,9,10)
BT <- c('Y','Y','Y','N','N','N','Y','N','Y','Y')
dat <- data.frame(DEK,PLOT,BT)
# how many unique values of BT?
# (fix: since R 4.0 data.frame() no longer converts strings to factors, so
# length(levels(dat$BT)) would return 0; count distinct values directly)
length(unique(dat$BT))
# how many unique plot values
length(unique(dat$PLOT))
# this data is duplicated in many ways
deduped.dat <- unique(dat[,1:3 ] ) # gets unique based on all 3 fields
# dplyr offers several options
# .keep_all = TRUE retains all columns, keeping the first row per key
deduped.dat <- dat%>%distinct(PLOT, .keep_all = TRUE) # based on PLOT
deduped.dat <- dat%>%distinct(PLOT,BT, .keep_all = TRUE) # based on PLOT and BT
deduped.dat <- dat%>%distinct(PLOT,BT,DEK, .keep_all = TRUE) # truly unique based on all listed fields
deduped.dat <- dat%>% distinct # same as above without listing fields related to duplication
# get distinct instance based on ID and pre term flag = 0
# (arrange first so distinct() keeps the desired row per member)
tmp1 <- tmp1_births%>%arrange(MBR_ID,pre_term)%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]
# get distinct last instance based on ID and pre term flag = 1
tmp1 <- tmp1_births%>%arrange(MBR_ID,desc(pre_term))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]
# correcting plot boundaries
par(mar=c(1,1,1,1))
# example routine checking and correcting duplicates based on key and 1 additional field
# check duplicates
tmp1 <- data.frame(table(df2$MBR_ID)) # find duplicates
tmp <- df2[df2$MBR_ID =='10060',] # appears duplicated on diagnosis &/or claim #
# get distinct sorting to keep most recent diagnosis (judgement call)
tmp2 <- df2%>%arrange(MBR_ID,desc(SVC_FROM_DT))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$MBR_ID =='10060',] # check
# update data frame
df2 <- tmp2
rm(tmp,tmp1,tmp2,tmp3) # cleanup
# example referencing against a list that is a random sample of numeric IDs
# note: change formatting in 'vals' if character IDs are required
# check duplicates
tmp1 <- data.frame(table(df5$ID)) # find duplicates
tmp <- df5[df5$ID =='80050',] # appears duplicated on flag #
# check against a random sample of duplicates
# (randomSample is the helper function defined earlier in this script)
rs <- randomSample(tmp1[tmp1$Freq > 1,],5) # look at random sample of duplicates
vals <- as.list((as.numeric(as.character(rs$Var1)))) # get ID as list from rs
tmp <- df5[df5$ID %in% vals,] # subset data with duplicated rows - check
# get distinct sorting to give priority to pre-term flag = 1 (judgement call)
tmp2 <- df5%>%arrange(ID,desc(pre_term))%>%distinct(ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$ID %in% vals,] # check again - all pre_term = 1 obs should be kept
# update data frame
df5 <- tmp2
rm(tmp,tmp1,tmp2,tmp3,rs,vals) # cleanup
#-----------------------------
# missing values
#-----------------------------
# count total missing values per column in a data frame
colSums(is.na(df1_chrt))
### example recode missing category
# is.na() already returns a logical, so use it directly (no == TRUE needed)
df1$Lvl <- ifelse(is.na(df1$Level),'1-Low-Risk',df1$Lvl)
### example - categorizing with a missing value category
df1$riskcat <- ifelse(df1$Risk_Score <= 3,'1-Low',
ifelse(df1$Risk_Score > 3 & df1$Risk_Score <= 5, '2-Moderate', '3-High'))
# account for NA group
df1$riskcat <- ifelse(is.na(df1$Risk_Score),'0-NA',df1$riskcat)
# remove missing based on specified field
# (fix: the original compared is.na() to the string 'FALSE', which only worked
# through implicit logical -> character coercion; negate the logical instead)
tmp3 <- tmp3[!is.na(tmp3$RUCC_2013),]
tmp2 <- na.omit(tmp1) # listwise deletion of all missing
# basic imputation
# Replace every NA in a numeric vector with the mean of the observed values.
impute.mean <- function(x) {
  fill <- mean(x, na.rm = TRUE)
  x[is.na(x)] <- fill
  x
}
# fix: apply() errors on a bare vector ("dim(X) must have a positive length");
# impute a single column by calling the imputation function on it directly
df$xvar <- impute.mean(df$xvar)
impute.zero <- function(x) {
  # Replace every NA in x with zero, leaving all other values untouched.
  x[which(is.na(x))] <- 0
  x
}
# impute all missing column values to zero; lapply keeps each column's own type,
# whereas apply() would coerce the data frame to a matrix (one character column
# would turn every column into character)
df1 <- data.frame(lapply(df, impute.zero))
#--------------------------
# loop processing
#-------------------------
# loop across fields and produce frequencies
# preallocate the result list (growing a list element-by-element copies it)
results <- vector("list", 5)
# run loop and simulate data; [[ assigns a single element, <- is the
# conventional assignment operator
for (i in seq_len(5)) {
  results[[i]] <- sample(1:15, 1)
}
x <- as.numeric(unlist(results)) # flatten list into a numeric vector
summary(x) # analysis
#--------------------------------------
# working with functions
#--------------------------------------
# standard deviation of every column; vapply keeps the result a named numeric
# vector and avoids apply()'s coercion of the data frame to a matrix (which
# would fail or give NAs if any column were non-numeric)
vapply(df, sd, numeric(1))
# convert all columns to numeric
# NOTE(review): sapply() returns a matrix here, not a data frame; wrap in
# data.frame() if downstream code expects data-frame semantics
df2 <- sapply(df1,as.numeric)
#-----------------------------------
# substring text and character operations
#-----------------------------------
# if ENDT still NULL and there is a coverage effective date then replace null end date
# NOTE(review): the original comment said "today's date" but the value is the
# fixed string "2018-10-14"; use as.character(Sys.Date()) if the current date
# is actually intended
df1$ENDT <- ifelse((substr(df1$ENDT,1,1) =="N" & (substr(df1$DateVar,1,1) != "N")),as.character.Date("2018-10-14"),df1$ENDT)
# remove first character
# (fix: the source column was df$ID, an apparent typo for df1$ID -- every other
# statement in this section operates on df1)
df1$ID <- substring(df1$ID, 2)
# remove last character
df1$ID <- substring(df1$ID,1,nchar(df1$ID)-1) # last quote
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment