Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active June 17, 2024 12:29
Show Gist options
  • Save BioSciEconomist/be83a5352c91c1e82a5d0846635fa483 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/be83a5352c91c1e82a5d0846635fa483 to your computer and use it in GitHub Desktop.
Data cleaning, processing, and analysis in R
# *-----------------------------------------------------------------
# | PROGRAM NAME: R Code Palette
# | DATE: 3/11/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE:
# *----------------------------------------------------------------
#----------------------------------
# documentation templates
# ---------------------------------
# *-----------------------------------------------------------------
# | PROGRAM NAME:
# | DATE:
# | CREATED BY:
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE:
# *----------------------------------------------------------------
###############################################################################
###############################################################################
################# construction zone #############################
###############################################################################
###############################################################################
##
## new section
##
# *-----------------------------------------------------------------*
# |
# |
# |
# | reading data
# |
# |
# |
# |
# *------------------------------------------------------------------*
# identify location of data used for project by specifying a working directory
# alternatively, if data is stored in numerous places, you can reference
# the file location directly
# --- workspace / session housekeeping snippets ---
# NOTE(review): rm(list=ls()) wipes the entire global environment; intended for
# interactive use at the top of a throwaway session, not for sourced scripts.
rm(list=ls()) # get rid of any existing data
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
cat("\f") # clear console
# helper: "clear console" substitute that just prints 50 blank lines
clc <- function() cat(rep("\n", 50))
rm("temp") # remove specific data frames (errors if 'temp' does not exist)
ls() # view open data sets (should be empty if you just ran the code above)
detach("package:vegan", unload=TRUE) # unload packages that we don't need or that mask other functions
# sometimes we may want to unload different versions of packages - the function below handles this +
# provides a specific example of how MatchIt loads MASS which masks dplyr
# function to detach MASS package which masks the 'select' operation within dplyr used for merging data
# we use dplyr extensively to process data but later in the analysis using the MASS package to estimate
# negative binomial models (for utilization counts like ER visits) it 'masks' or prevents 'select' from working
# this is a problem if we were previously using MASS (or MatchIt which leverages MASS) in an analysis and then wanted to run this program
# so we want to 'unload' or 'detach' any versions of MASS which may be loaded before trying to build the data
# for the analysis below leveraging dplyr functions like 'select.' This is unnecessary if MASS has not been loaded
# see also: https://stackoverflow.com/questions/6979917/how-to-unload-a-package-without-restarting-r
# Detach (and unload) every copy of a package from the search path.
# `pkg` may be given unquoted (default) or as a string with character.only = TRUE.
# Looping matters: multiple versions of the same package can be attached at once.
detach_package <- function(pkg, character.only = FALSE) {
  # Resolve an unquoted package symbol to its name unless told otherwise.
  if (!character.only) {
    pkg <- deparse(substitute(pkg))
  }
  target <- paste0("package:", pkg)
  # Keep detaching until no entry for this package remains on the search path.
  while (target %in% search()) {
    detach(target, unload = TRUE, character.only = TRUE)
  }
  invisible(NULL)
}
# unload order matters: MatchIt imports MASS, so drop MatchIt first
detach_package("MatchIt", TRUE) # first have to unload 'MatchIt' because it imports 'MASS'
detach_package("MASS", TRUE) # unload/detach MASS
# view specific columns (assumes df1 exists with at least 103 columns — verify)
View(df1[,100:103])
str(df1) # variable names and formats
str(df1, list.len=ncol(df1)) # in case R truncates the output above
# set working directory (pick the line matching your OS; paths are machine-specific)
setwd('/Users/wkuuser/Desktop/R Data Sets') # mac
setwd("P:\\R Code References\\R Training") # windows
# *------------------------------------------------------------------*
# | make R talk on a macbook
# *------------------------------------------------------------------*
# shells out to the macOS `say` command; no-op/error on other platforms
b <- sprintf("say Pumpkin pie is the best")
system(b, intern = FALSE, ignore.stdout = FALSE, ignore.stderr =
FALSE, wait = TRUE, input = NULL)
# *--------------------------------------------------------------------------------------------------*
# | direct reading of a single variable vector from command line- similar to cards statement in SAS
# *------------------------------------------------------------------------------------------------*
# Hand-keyed trial data: yields for four hybrids across ten plots,
# plus Y/N trait flags (BT, RR) per plot — the SAS "cards" idea, in R.
GARST <- c(150, 140, 145, 137, 141, 145, 149, 153, 157, 161)
print(GARST)
yield_data <- data.frame(
  GARST = GARST,
  PIO   = c(160, 150, 146, 138, 142, 146, 150, 154, 158, 162),
  MYC   = c(137, 148, 151, 139, 143, 120, 115, 136, 130, 129),
  DEK   = c(150, 149, 145, 140, 144, 148, 152, 156, 160, 164),
  PLOT  = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  BT    = c("Y", "Y", "N", "N", "N", "N", "Y", "N", "Y", "Y"),
  RR    = c("Y", "N", "Y", "N", "N", "N", "N", "Y", "Y", "N")
)
rm(GARST) # drop the free-standing vector; the data frame keeps its own copy
# quick look: GARST vs PIO yields
plot(yield_data$GARST, yield_data$PIO)
# this is even more like the cards statement in SAS
# Inline data entry, closest R analogue to a SAS datalines/cards block:
# the whole table lives in a string literal, header row included.
toread <- "id sex age inc r1 r2 r3
1 F 35 17 7 2 2
17 M 50 14 5 5 3
33 F 45 6 7 2 7
49 M 24 14 7 5 7
65 F 52 9 4 7 7
81 M 44 11 7 7 7
2 F 34 17 6 5 3
18 M 40 14 7 5 2
34 F 47 6 6 5 6
50 M 35 17 5 7 5"
# read.table's text= argument parses the literal directly
survey <- read.table(text = toread, header = TRUE)
closeAllConnections()
#--------------------------------------------------
# reading a file from the web
#---------------------------------------------------
# fread (data.table) reads a CSV straight from a URL; requires network access
library(data.table)
mydat <- fread('https://raw.githubusercontent.com/dincerti/dincerti.github.io/master/data/mepsdiab.csv')
head(mydat)
# *------------------------------------------------------------------*
# | reading data from a txt file
# *------------------------------------------------------------------*
# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N
# whitespace-delimited text file with a header row (see layout sketched above)
yield_data<-read.table("Yield_plots.txt", header=TRUE) # read text file
names(yield_data) # var list
print(yield_data) # print data set
yield_data # note: print() is not necessary to print
# get all factor variables
# NOTE(review): since R 4.0 read.table no longer auto-converts strings to
# factors, so is.fact may be all FALSE here — confirm against your R version
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[,is.fact]
# *------------------------------------------------------------------*
# | reading data from a csv file
# *------------------------------------------------------------------*
# data looks like this
# HYBRID TRAIT P1 P2 P3
# GARST RR 150 140 145
# MYC BT 160 150 146
# PIO RR 137 148 151
# DEK BT 150 149 145
# na.strings: treat ".", empty string, and "?" as missing on read
plots <- read.csv("plots.csv", na.strings=c(".", "NA", "", "?"), encoding="UTF-8")
plots
# create function to reformat column names from csv files (from:https://stackoverflow.com/questions/17152483/how-to-replace-the-in-column-names-generated-by-read-csv-with-a-single-spa )
# Clean up column names mangled by read.csv (which turns spaces/punctuation
# into dots): any run of one-or-more dots becomes a single underscore, then
# trailing blanks (if any) are stripped. Returns the data frame, renamed.
makeColNamesUserFriendly <- function(ds) {
  nms <- names(ds)
  # collapse consecutive dots to "_"
  nms <- gsub("(\\.)+", "_", nms)
  # drop trailing spaces
  nms <- gsub("( )+$", "", nms)
  names(ds) <- nms
  ds
}
# create unique ID for each line in data frame (simple 1..n row counter)
tmp_claims$RowID <- 1:nrow(tmp_claims)
# convert index (row names) to an ordinary column
names <- rownames(yield_data)
yield_data <- cbind(names,yield_data)
# repair read error for first ID (sometimes a csv read causes the first element in the file to misread)
df$ID <- as.character(df$ID)
df[1,1] <- '123456'
# *------------------------------------------------------------------*
# | reading data from a excel xlsx
# *------------------------------------------------------------------*
xlsxFile <- "C:\\Users\\mtb2901\\Documents\\Data Science Projects\\Projects 2018\\201801 Hospital Risk Profile\\Raw Data\\TX-Hospital-Tricare-Network.xlsx"
# reference: https://www.rdocumentation.org/packages/openxlsx/versions/4.0.17/topics/read.xlsx
# read.xlsx(xlsxFile, sheet = 1, startRow = 1, colNames = TRUE,
# rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
# skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
# namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)
# NOTE(review): `sheetIndex` is the xlsx-package argument; openxlsx (cited
# above) uses `sheet` — confirm which package is actually loaded here
vars <- read.xlsx("FraudT3CB14jun18HEADER.xlsx",sheetIndex = 1) # read header file with variable names
#-------------------------------------
# writing files
#--------------------------------------
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)
# export as R data file
save(MBR_FACT_ANLY, file = "//projects//Data//MBR_FACT_ANLY.RData")
# export as text with tab delimiter
# (fixed: the file path was missing its opening quote, a syntax error)
write.table(df1, "//Projects//Data//df1.txt", row.names = FALSE, sep = "\t")
# *-----------------------------------------------------------------*
# |
# |
# |
# | printing and subsetting data
# |
# |
# |
# |
# *------------------------------------------------------------------*
# print selected variables (single-bracket select keeps a data frame)
yield_data[c("GARST")]
# subset data via variable selection
my_hybrids <- yield_data[ c("GARST", "PIO")]
print(my_hybrids)
# subset based on a single variable
# NOTE: this creates GARST as a one-column DATA FRAME in the global env
GARST<-yield_data[c("GARST")]
print(GARST)
df <- df[ , !names(df) %in% c("xvar")] # drop field
# print a subset of data based on observed variable values
print(yield_data[yield_data$GARST==150 & yield_data$PIO==160,])
# subset data based on observed values
high_yields <- yield_data [ yield_data$GARST==150 & yield_data$PIO==160,]
print(high_yields)
stacked_traits <-yield_data[ yield_data$BT =="Y" & yield_data$RR =="Y",]
stacked_traits
# subset based on just one variable value
# NOTE(review): these two lines rely on the one-column data frame GARST
# created above (row-wise comparison on a data frame); the more conventional
# form would be yield_data$GARST == 140 — verify intent
LOW_GARST<-yield_data[GARST==140,]
print(LOW_GARST)
LOW_GARST<-yield_data[GARST < 150,]
print(LOW_GARST)
RR<- yield_data[yield_data$RR =="Y",]
RR
Bt<- yield_data[yield_data$BT=="Y",]
Bt
# subset based on %in% (membership test; never returns NA)
vc <- c(150,145,195)
tmp <- yield_data[yield_data$GARST %in% vc,]
# or
tmp <- yield_data[yield_data$GARST %in% c(150,145,195,161),]
# this might also work using dplyr (which allows for additional manipulation if you want to
# 'pipe' in additional aggregations or calculations)
library(dplyr)
tmp <- yield_data %>%
filter(GARST %in% c(150,145,195))
# subset based on 'not' %in% using dplyr
tmp <- filter(yield_data,!BT %in% c('Y'))
library(sqldf)
tmp <- sqldf('select * from yield_data where GARST in(150,145,195)')
#------------------------------------------------
# connect to SQL server
#------------------------------------------------
# read initial analysis file
library(RODBC)
# connect to the SQL Server (trusted/Windows authentication; server name is site-specific)
DB_Connection <- odbcDriverConnect('driver={SQL Server};server=arlsql123;database=ANLY_SBOX;trusted_connection=true')
# connect to specific table (see documentation to limit rows etc.)
MBR_FACT_ANLY <- sqlFetch(DB_Connection, "mbr_fact_anly")
# export as R data file for future use (so we don't have to hit SQL every time)
save(MBR_FACT_ANLY, file = "//projects//mtt94674//Data//MBR_FACT_ANLY.RData")
# make sure to close your database connection
close(DB_Connection)
# *-----------------------------------------------------------------*
# |
# |
# |
# | summary statistics
# |
# |
# |
# |
# *------------------------------------------------------------------*
summary(yield_data) # whole data set
summary(yield_data[c("GARST")]) # single variable
summary(yield_data$GARST) # single variable (more efficient)
summary(yield_data[c("GARST" , "PIO")]) # selected variables
summary(sqrt(yield_data$GARST) ) # transformation
# by processing: one summary per BT level
by(yield_data, yield_data$BT, summary)
# using dplyr
yield_data%>%
group_by(BT) %>% # group by s
summarize(mean = mean(GARST)) # mean
### more examples with dplyr
# how many members per risk group (members are duplicated by transactions)
# as such the same member can have more than one transaction scored by the model
# that are in the same risk group and a member can belong to multiple
# risk groups if they have multiple transactions scored accordingly
### brute force example (one filter per risk category; only the last tmp survives)
tmp <- df4%>%filter(riskcat == '1:LOW')%>%distinct(mbrid)
tmp <- df4%>%filter(riskcat == '2:MOD')%>%distinct(mbrid)
tmp <- df4%>%filter(riskcat == '3:HI')%>%distinct(mbrid)
tmp <- df4%>%filter(riskcat == '4:VHI')%>%distinct(mbrid)
### a little less brute force
tmp1 <- df4%>%select(riskcat,mbrid)%>%arrange(riskcat,mbrid) # sort by risk and ID
tmp2 <-tmp1%>%group_by(riskcat)%>%distinct(mbrid) # get unique IDs per risk strata
# how many members per strata
tmp2%>%
group_by(riskcat)%>%
summarize(total = n())
### can we do the things above in one step
df4%>%select(riskcat,mbrid)%>%arrange(riskcat,mbrid)%>%group_by(riskcat)%>%distinct(mbrid)%>%
summarize(total = n())
library(Hmisc)
# frequencies (NOTE: Hmisc is loaded twice here; the second call is redundant)
library(Hmisc)
describe(yield_data)
# contents of data set - (Hmisc)
contents(yield_data)
# aggregations and crosstabulations (Hmisc formula-style summary)
summary(ACTIV_PARTICIP ~ AGECAT, data=hc_mbr_anly, fun=mean)
# tables
tmp <- data.frame(table(complications$Measure_ID)) # saved as an easily readable data frame
aggregate(ACTIV_PARTICIP~RUCC_2013_DESC,hc_mbr_anly,mean)
# toy crop/trait data for the crosstab examples
# NOTE(review): this OVERWRITES yield_data from earlier sections — it now has
# only CROP and RR columns, so later references to yield_data$GARST will fail
CROP <- c('Cotton','Cotton','Corn','Corn','Corn','SB','SB','SB','SB','SB')
RR <- c('Y','N','Y','N','N','N','N','Y','Y','N')
yield_data <- data.frame(CROP,RR)
rm(CROP,RR) # cleanup
mytable <-table(yield_data$RR,yield_data$CROP)
tmp <- data.frame(mytable)
names(tmp) <- c("RR","Crop","Total")
print(tmp)
tmp <- data.frame(prop.table(mytable)) # cell percentages
names(tmp) <- c("RR","Crop","Pct")
print(tmp)
tmp <- data.frame(prop.table(mytable, 1)) # row percentages (%Crop within RR status)
names(tmp) <- c("RR","Crop","RowPct")
print(tmp)
tmp <- data.frame(prop.table(mytable, 2)) # column percentages (%RR within Crop)
names(tmp) <- c("RR","Crop","ColPct")
print(tmp)
# cross tabulations across a range of variables
# average garst across BT and RR
# NOTE(review): in R >= 4.0 strings are not factors by default, so is.fact may
# select nothing; and GARST no longer exists in this version of yield_data
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[,is.fact]
vars <- names(tmp)
for(i in 1:2) {
print("+-+-+-+-+-+-+-+-+-+-+-+-")
print(vars[i])
print("+-+-+-+-+-+-+-+-+-+-+-+-")
print(by(yield_data$GARST, yield_data[c(vars[i])],mean))
}
# cross tabulations across a range of columns in a data frame
names(sb) # list columns for reference
xvar <- as.list(names(sb)) # save names as a list to reference in loop
# loop across fields and produce frequencies
# NOTE(review): 1:36 hard-codes the column count; seq_along(xvar) would adapt
for(i in 1:36) {
print(xvar[i])
print(data.frame(table(sb[c(paste(xvar[i]))])))
}
aggregate(totmthcov~pre_term,df1,mean)
# using dplyr: one-row distribution summary of the prob column
df1%>%
summarize(mean = mean(prob,na.rm = TRUE),
min = min(prob,na.rm = TRUE),
Q1=quantile (prob, probs=0.25,na.rm = TRUE),
med = median(prob,na.rm = TRUE),
Q3=quantile(prob, probs=0.75,na.rm = TRUE),
P85=quantile(prob, probs=0.85,na.rm = TRUE),
P90=quantile(prob, probs=0.90,na.rm = TRUE),
P95=quantile(prob, probs=0.95,na.rm = TRUE),
P99=quantile(prob, probs=0.99,na.rm = TRUE))
# aggregate totals per person
df5 <- df4%>%
group_by(ID) %>%
summarize(TOTAL_ALLOW_PRE = sum(TOT_ALLOWED_AMT))
xtabs(~ BT, data = yield_data) # counts of types of BT
xtabs(~ BT + RR, data = yield_data) # BT by RR
xtabs(GARST ~ RR, data=yield_data) # total RR for GARST
# categorical frequencies by treatment status
mytable <- table(df1$PrimaryDiagDesc,df1$treat)
prop.table(mytable, 2) # column percentage
# *-----------------------------------------------------------------*
# |
# |
# |
# | creating and adding new variables
# |
# |
# |
# |
# *------------------------------------------------------------------*
# create a new data set that includes a new variable that is
# the sum of "garst" and "pio"
yield_data2<-transform(yield_data, sum=GARST+PIO)
print (yield_data2)
# add a new variable to existing data set that is
# the sum of all hybrids
#first create data set HYBRIDS
hybrids<-yield_data
print(hybrids)
# add new variable to hybrids
hybrids<-transform(hybrids, SUM=GARST+PIO+MYC+DEK)
print(hybrids)
# or add 2 new variables
# NOTE(review): within a single transform() call, MEAN=SUM/4 cannot see the
# SUM defined in the same call — it works here only because SUM already
# exists on hybrids from the previous line; verify before reusing standalone
hybrids<-transform(hybrids, SUM=GARST+PIO+MYC+DEK,
MEAN=SUM/4)
print(hybrids)
# combine or concatenate two columns into a new string column
yield_data$x <- paste(yield_data$RR,yield_data$BT)
yield_data$x <- paste(yield_data$RR, "-",yield_data$BT)
# (fixed: trailing prose was left uncommented, making this line a syntax error)
df$x <- paste(df$n, "-", df$s) # for inserting a separator
# *-----------------------------------------------------------------*
# |
# |
# |
# | conditional processing
# |
# |
# |
# |
# *------------------------------------------------------------------*
# recall from the data set there are 10 plots
# first assume that in plot 3 we accidentally planted a Bt
# garst hybrid that was supposed to be a non-Bt hybrid
# so we don't want to include it in the non Bt
# average for hybrids- we can use 'ifelse' conditions
# to exclude this from the mean calculation
# first create data set trials
trials<-yield_data
print(trials)
print(trials$PLOT)
# NOTE(review): the inner <- assignments below are redundant side effects;
# the outer ifelse() result overwrites trials$AVG anyway
trials$AVG <- ifelse ( trials$PLOT == 3 ,
(trials$AVG <- (trials$PIO+trials$MYC+trials$DEK)/3), # excludes GARST in the calculation if PLOT = 3
(trials$AVG <- (trials$GARST+trials$PIO+trials$MYC+trials$DEK)/4 ) ) # for all other plots
# creates new variable AVG in data set trials
trials
# another example - notice the nesting of the ifelse conditions:
grades$letter <- ifelse(grades$score >= .90,"A", ifelse(grades$score >= .80 & grades$score < .90, "B",ifelse (grades$score >= .70 & grades$score < .80,"C",ifelse(grades$score >= .60 &
grades$score < .70, "D","F"))))
table(grades$letter)
# create binary target (1 = anything other than "no")
banking$target <- ifelse(banking$y =="no",0,1)
# model calibration / risk stratification (cut points: .12, .30, .60)
df1$risk <- ifelse(df1$prob < .12,"1:LOW",
ifelse(df1$prob >= .12 & df1$prob < .30, "2:MOD",
ifelse(df1$prob >= .3 & df1$prob < .60, "3:HI","4:VHI")))
table(df1$risk)
# age categorization (nested ifelse; cut() would be the compact alternative)
df1$AgeCat <- ifelse(df1$age < 18,'Under 18',
ifelse(df1$age >= 18 & df1$age <= 25,'18-25',
ifelse(df1$age > 25 & df1$age <= 29, '26-29',
ifelse(df1$age > 29 & df1$age <= 39,'30-39',
ifelse(df1$age > 39 & df1$age <= 49, '40-49',
ifelse(df1$age > 49 & df1$age <= 59, '50-59',
ifelse(df1$age >59 & df1$age <= 64, '60-64','65-65+')))))))
# *-----------------------------------------------------------------*
# |
# |
# |
# | stacking / concatenating/ adding data sets
# |
# |
# |
# |
# *------------------------------------------------------------------*
# get only bt hybrids- same as subsetting
BT_HYBRIDS <- yield_data[yield_data$BT=="Y", ]
print(BT_HYBRIDS)
# get only non bt hybrids
NON_BT_HYBRIDS<- yield_data[yield_data$BT=="N",]
print(NON_BT_HYBRIDS)
# *------------------------------------------------------------------*
# | combine with the rbind function (columns must match)
# *------------------------------------------------------------------*
both<-rbind(BT_HYBRIDS,NON_BT_HYBRIDS)
print(both)
# *------------------------------------------------------------------*
# | stack with SQL using sqldf
# *------------------------------------------------------------------*
library(sqldf)
library(ggplot2)
both_sql = sqldf('
select *
from BT_HYBRIDS union all select * from NON_BT_HYBRIDS
order by PLOT
')
# note: the 'all' statement keeps all rows, if you omit 'all' you get only unique rows
both_sql
# *-----------------------------------------------------------------*
# |
# |
# |
# |
# | merging based on common variables
# |
# |
# |
# *------------------------------------------------------------------*
# first split the data, one set contains plot, bt, rr, garst, pio
# the other set has plot, rr, bt, myc, dek
# we'll work under the assumption that the first set is 'early'
# hybrids and the second set is 'late' hybrids
# as if we collected data on these separately and want to combine the
# data set to get one set, like we started with with yield_data
# create early / late planting data set
early<-yield_data[c("PLOT","GARST","PIO","BT","RR")]
early
late<-yield_data[c("PLOT","MYC","DEK","BT","RR")]
late
# *------------------------------------------------------------------*
# | merge by common variables- PLOT, BT, RR
# | (merging on ALL shared columns avoids .x/.y duplicate suffixes)
# *------------------------------------------------------------------*
hybrids<-merge(early,late,by=c("PLOT","BT","RR"))
hybrids
# *------------------------------------------------------------------*
# | merging by a single common variable
# *------------------------------------------------------------------*
# the above merges back to the original data set, but
# we had to be careful to merge by ALL of the common variables
# suppose we had the following data set strip_trial.txt
# HYBRID YIELD TRAIT
# GARST 150 BT
# MYC 140 RR
# PIO 160 RR
# DEK 145 BT
# lets read it in and split it up to form 2 data sets 'yields'
# and 'traits' with a SINGLE common variable 'HYBRID' to merge by
#'yields'
# HYBRID YIELD
# GARST 150
# MYC 140
# PIO 160
# DEK 145
# 'traits'
# HYBRID TRAIT
# GARST BT
# MYC RR
# PIO RR
# DEK BT
# read data 'yields'
yields <- read.table("yields.txt", header=TRUE) # read text file
yields
# read data 'traits'
traits <- read.table("traits.txt", header=TRUE) # read text file
traits
# now to demonstrate merging data sets by a common variable
field_data<-merge(yields,traits,by=c("HYBRID"))
field_data
# *------------------------------------------------------------------*
# | merging with SQL using sqldf
# *------------------------------------------------------------------*
# library(sqldf)
# library(ggplot2)
# left join yields and traits on HYBRID
field_data_sql = sqldf('
select a.HYBRID,a.YIELD,b.TRAIT
from yields a left join traits b
on a.HYBRID =b.HYBRID
order by a.YIELD')
field_data_sql
# left join early and late (from above) on PLOT- note for each plot RR and BT are the
# same regardless of the hybrid so specifying BT and RR for only one data set will suffice,
# but we need to join on PLOT this seems to give a much more straightforward solution to the
# merge above using the R merge statement
hybrids_sql = sqldf('
select a.PLOT,a.GARST,a.PIO,
b.MYC, b.DEK, a.BT, a.RR
from early a left join late b
on a.PLOT =b.PLOT
order by a.PLOT')
hybrids_sql
#-----------------------------------------------------
# merging using dplyr
#----------------------------------------------------
tmp3 <- tmp2%>% left_join(tmp1, by = "State")
# get all months between EFFMTH and ENDMTH in cohort history file
monthlist <- df1%>%left_join(select(tmp5,MBRID,Month,covmth),
by = "MBRID")%>%filter(Month >= EFFMTH & Month <= ENDMTH)
monthlist <- monthlist %>%arrange(MBRID,Month)
# more complicated join with dplyr
# merge claims data with cohort file keeping only claims between the coverage periods
# (fixed: the join key was "DMBRID", a column that exists on neither side —
# both frames carry the key as MBRID after the select() rename of PID)
df3 <- df2%>%select(MBRID,EFFMTH,ENDMTH,INDEX_MTH_DT,pre12,post12,claimflag)%>%left_join(select(raw_claims,MBRID = PID,Claim_Number,SVC_FROM_DT,TOT_ALLOWED_AMT),
by = "MBRID")%>%filter(SVC_FROM_DT >= EFFMTH & SVC_FROM_DT <= ENDMTH)
# *-----------------------------------------------------------------*
# |
# |
# |
# | sorting data
# |
# |
# |
# *------------------------------------------------------------------*
# use file plots2 weed pressure
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# sort data by location (order() returns a permutation of row indices)
plotsSorted<-plots2[order(plots2$LOCATION),]
plots2
plotsSorted
# sort by trait then location
plots2Sorted<-plots2[order(plots2$TRAITS, plots2$LOCATION),]
plots2Sorted
# for descending order, prefix any variable with a minus sign
# (the minus trick only works for numeric columns, as LOCATION is here)
plots2Sorted<-plots2[order(-plots2$LOCATION,plots2$TRAITS),]
plots2Sorted
# dplyr
tmp <- arrange(tmp,desc(Freq))
# *-----------------------------------------------------------------*
# |
# |
# |
# | graphics
# |
# |
# |
# *------------------------------------------------------------------*
# *------------------------------------------------------------------*
# | examples of r graphics capabilities (interactive demos)
# *------------------------------------------------------------------*
demo(graphics)
demo(persp)
library(lattice)
demo(lattice)
# *------------------------------------------------------------------*
# | basic plots
# *------------------------------------------------------------------*
# histogram
hist(yield_data$GARST)
# bar plot
# NOTE(review): rownames() of a plain vector is NULL, so this legend
# argument likely produces no legend — verify
barplot(yield_data$GARST, main="Garst Yields", xlab ="Plots",col =c("darkblue","red"), legend =rownames(yield_data$PLOT))
# bar plot (example) — keep only levels with more than 8 occurrences
# (assumes mytable is the data.frame form, e.g. data.frame(table(...)),
# which has $Freq/$Var1 columns — TODO confirm against the caller)
counts <- mytable$Freq[mytable$Freq > 8]
# (fixed: `False` is not an R constant — the logical literal is FALSE)
barplot(counts, main="Conditions of Members",
xlab="Conditions", col=c("darkblue","red"),
legend = mytable$Var1[mytable$Freq > 8], beside=FALSE)
# using xtabs to create plots
plot(~xtabs(GARST ~ RR, data=yield_data)) # barplot
plot(xtabs(~ BT, data = yield_data)) # barplot
plot(xtabs(~ BT + RR, data = yield_data)) # mosaic plot
# scatter plot
plot(yield_data$GARST,yield_data$PIO)
# scatterplot matrix of whole data set
plot(yield_data)
# scatterplot matrix- data used in regression in analysis section
land_prices<- read.table("land_prices.txt", header =TRUE)
land_prices
plot(land_prices)
# brushed scatterplot - as in model visualization (points colored/shaped by risk strata)
ggplot(df1,aes(x=SUMRX_NONLTOT_QB_GE120MEDD,y=SUMRX_LTOT_QB_GE120MEDD, shape = risk,color = risk)) +
scale_color_manual(values=c("green","yellow", "red","red")) + geom_point()
# labels + jittering (seed fixed so the jitter is reproducible)
set.seed(456)
ggplot(df1,aes(x=SUMRX_NONLTOT_QB_GE120MEDD,y=SUMRX_LTOT_QB_GE120MEDD, shape = risk,color = risk)) +
scale_color_manual(values=c("green","yellow", "red","red")) + geom_point() + geom_jitter(width = 2, height =2) +
xlab("Total Non-Qualified >= 120") +
ylab("Total Qualified >= 120") +
ggtitle("Observed Rx Counts Overlaid by Predicted Risk/Stratification")
# *------------------------------------------------------------------*
# | stacked bar chart
# *------------------------------------------------------------------*
# get data
bytrait<- read.table("biotech_yield.txt", header =TRUE)
bytrait
names(bytrait)
# summarize data for stacking by trait
counts <- table(bytrait$RR, bytrait$YIELD) # list stacked variable first
counts
barplot(counts, main ="Yield by Trait for Garst",xlab ="Yield Category", col =c("darkblue","red"),legend =rownames(counts))
#--------------------------------------------
# pie and bar charts in ggplot
#--------------------------------------------
table(df$pre_term) # get values by group
# hard-coded counts taken from the table() output above
tmp <- data.frame(
group = c("Full-Term", "Pre-Term"),
value = c(7999,6000)
)
# stacked bar
bp <- ggplot(tmp, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity")
bp
# pie: a stacked bar in polar coordinates
# NOTE(review): assigning to `pie` shadows base R's pie() function for the
# rest of the session (the base pie chart further below still resolves,
# but the shadowing is worth knowing about)
pie <- bp + coord_polar("y", start=0)
pie
pie + scale_fill_manual(values=c("red","blue"))
# *------------------------------------------------------------------*
# | basic density plots
# *------------------------------------------------------------------*
# plot and save histogram data (hist() returns its bin counts invisibly)
garsths <- hist(yield_data$GARST, main = "Garst Yield Histogram")
print(garsths)
# compute density estimates using default gaussian & rule of thumb bandwidth
garstdens <- density(x=yield_data$GARST, bw="nrd0",kernel="gaussian",n=20)
print(garstdens)
# rescale density so it will plot on the same graph as GARST histogram
rs <- max(garsths$counts/max(garstdens$y)) # create scaling factor for plotting the density
lines(garstdens$x, garstdens$y*rs, col=2) # plot density over histogram using lines statement
# compare this to just plotting the non-rescaled density over the histogram
lines(density(x=yield_data$GARST, bw="nrd0", n=20),col=3) # green line barely shows up
# turn off/close graphics window
dev.off()
# *------------------------------------------------------------------*
# | overlapping density plots
# *------------------------------------------------------------------*
library(colorspace) # package for rainbow_hcl function
# Generate just the data for a histogram of GARST grouping by RR trait
# (stack the full sample plus each RR subgroup into one long frame)
ds <- rbind(data.frame(dat=yield_data[,][,"GARST"], grp="All"),
data.frame(dat=yield_data[,][yield_data$RR=="N","GARST"], grp="N"),
data.frame(dat=yield_data[,][yield_data$RR=="Y","GARST"], grp="Y"))
# histogram for all GARST data (ylim hard-coded for this particular sample)
hs <- hist(ds[ds$grp=="All",1], main="", xlab="GARST", col="grey90", ylim=c(0, 9.46395633979357), breaks="fd", border=TRUE)
# compute density, rescale, and plot for all GARST data
dens <- density(ds[ds$grp=="All",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[1])
# compute density, rescale, and plot where RR = 'N'
dens <- density(ds[ds$grp=="N",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[2])
# compute density, rescale, and plot where RR = 'Y'
dens <- density(ds[ds$grp=="Y",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[3])
# Add a rug to illustrate density
rug(ds[ds$grp=="N", 1], col=rainbow_hcl(3)[2])
rug(ds[ds$grp=="Y", 1], col=rainbow_hcl(3)[3])
# Add a legend to the plot.
legend("topright", c("All", "N", "Y"), bty="n", fill=rainbow_hcl(3))
# Add a title to the plot (title string intentionally spans two lines).
title(main="Distribution of GARST
by RR", sub=paste("Created Using R Statistical Package", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
### pie chart (base graphics)
table(tmp$Enrollee) # get counts
# slice values hard-coded from the table() output above
slices <- c(3111,14141) # define slices
lbls <- c("True","False") # slice labels
pct <- round(slices/sum(slices)*100,digits =2) # calc pct
lbls <- paste(lbls, pct) # add calculated percents to labels
lbls <- paste(lbls,"%",sep="") # add % to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),
main="Members")
### aggregate time series and produce multiple line chart (dplyr & ggplot2)
# subset only top 5 departments for OT Hrs
tmp2<-
filter(df2,Role=="Specialist"|Role=="Finder"|Role=="Customer Care Specialist"|Role =="Clinical"|Role=="Content Specialist")
# aggregate by department and date
tmp3 <- tmp2%>%
group_by(Role,End_Date) %>%
summarize(TotalOTHrs = sum(Overtime_Hours))
# one line per Role over time
ggplot(tmp3,aes(End_Date,TotalOTHrs,colour=Role)) + geom_line() + geom_point()
### aggregate and simple plot
# aggregate & visualize total OT by pay period
tmp <- df2%>%
group_by(End_Date) %>%
summarize(TotalOTHrs = sum(Overtime_Hours))
plot(tmp$End_Date,tmp$TotalOTHrs,type="b", xlab = "End Date", ylab = "Total OT Hrs")
#------------------------------------------------
# side by side plots
#------------------------------------------------
par(mfrow=c(1,2)) # set the plotting area into a 1*2 array
hist(treat,breaks = 50)
hist(ctrl, breaks = 50)
# *-----------------------------------------------------------------*
# |
# |
# |
# | analysis
# |
# |
# |
# *------------------------------------------------------------------*
# *------------------------------------------------------------------*
# | t-tests
# *------------------------------------------------------------------*
# independent samples t-test
t.test(yield_data$GARST,yield_data$PIO ) # difference in GARST vs PIO yields from yield_data file
t.test(plots2$P4[plots2$TRAITS=='RR'],plots2$P4[plots2$TRAITS=='BT'] ) # difference in weed pressure for bt and rr traits in plot 4
# paired t-test
t.test(plots2$P1,plots2$P2,paired=TRUE) # differences in overall weed pressure for plots 1 and 2 (more appropriately used on same subjects- before and after tests)
# example: if plot 1 was before herbicide application while plot 2 was the same plot measured for weed
# pressure after application
# (fixed: the line above was a wrapped comment missing its leading '#', a syntax error)
t.test(yield_data$GARST,yield_data$PIO,paired=TRUE) # not actually appropriate but notice the difference in the results from the previous t-test on
# GARST and PIO
# *------------------------------------------------------------------*
# | measures of association
# *------------------------------------------------------------------*
# pearson correlations (Hmisc::rcorr) - also returns n and p-value matrices
rcorr(cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4)) # gives pearson correlations
# the cor.test has p-value, ci, but only 2 vars at a time
# NOTE(review): cor.test has no `use` argument — it is silently swallowed
# by ...; NA handling here is cor.test's default, verify that is intended
cor.test(plots2$P1,plots2$P2,use="pairwise")
# spearman correlations using the hmisc rcorr function
rcorr( cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4), type='spearman')
rcorr( cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4), type='pearson') # compare, also same as default
# *------------------------------------------------------------------*
# | simple linear regression
# *------------------------------------------------------------------*
# lets use another data set land_prices
# PRICE ACRES IMPROVEMENTS
# 36 9 8
# 80 15 7
# 44 10 9
# 55 11 10
# 35 10 6
# read the whitespace-delimited file from the current working directory
land_prices<- read.table("land_prices.txt", header =TRUE)
land_prices # print the data to verify it loaded as expected
plot(land_prices) # scatterplot matrix of dependent and independent variables
myRegModel<-lm(PRICE~ACRES,data=land_prices) # run regression of PRICE on ACRES
summary(myRegModel) # view estimates
anova(myRegModel) # analysis-of-variance table for the fitted model
# simple plots - plot data and regression line
plot(land_prices$ACRES,land_prices$PRICE)
abline(myRegModel) # overlay the fitted regression line on the scatterplot
# automated plots (diagnostics: residuals, QQ, scale-location, leverage)
plot(myRegModel)
termplot(myRegModel) # partial regression (term) plot
# *------------------------------------------------------------------*
# | multiple linear regression
# *------------------------------------------------------------------*
# same response, now with two predictors; NOTE this overwrites the
# myRegModel object from the simple-regression example above
myRegModel<-lm(PRICE~ACRES+IMPROVEMENTS,data=land_prices) # run regression
summary(myRegModel) # view estimates
anova(myRegModel) # sequential (Type I) sums of squares for each term
plot(myRegModel) # regression diagnostics
termplot(myRegModel) # partial effect of each predictor
# *------------------------------------------------------------------*
# | analysis of variance
# *------------------------------------------------------------------*
# look at data set 'yield_data'
yield_data<-read.table("Yield_plots.txt", header=TRUE) # read text file
print(yield_data) # print data set
# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N
# we would like to set up a design such as this:
# let treatments = hybrid, blocks = plot
#------------------------------------------------#
# HYBRID
# GARST PIO MYC DEK
# PLOT 1 150 137 160 150
# PLOT 2 140 148 150 149
# PLOT 3 145 151 146 145
#------------------------------------------------#
# actual data set will need to look like this 'yield_trials'
# (long/stacked format: one row per hybrid x plot observation)
# HYBRID PLOT YIELD
# GARST P1 150
# GARST P2 140
# GARST P3 145
# PIO P1 137
# PIO P2 148
# PIO P3 151
# MYC P1 160
# MYC P2 150
# MYC P3 146
# DEK P1 150
# DEK P2 149
# DEK P3 145
yield_trials<-read.table("yield_trials.txt", header=TRUE) # read text file (long format shown above)
print(yield_trials) # print data set
# *------------------------------------------------------------------*
# | 2 WAY AOV
# *------------------------------------------------------------------*
# two-way ANOVA with no interaction term: YIELD modeled on HYBRID
# (treatment) and PLOT (block), i.e. a randomized complete block design
myModel<-aov(YIELD~HYBRID+PLOT,data=yield_trials)
summary(myModel)
anova(myModel) # should give same results as above
plot(myModel) # diagnostic plots for the fitted model
termplot(myModel)
#--------------------------------------------------
# getting marginal/adjusted/least squares means from multivariable models
#--------------------------------------------------
### example linear regression
# NOTE(review): df_chrt is built elsewhere -- this snippet will not run standalone
summary(m1 <- lm(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping + SourceFlag + Gender_Code + LowIncome + RuralZips + Pre_ER_Admit + AcutePre + Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma + Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression + HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag + Total_Bene + Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals +BranchDesc + SponsorRankDesc + AgeCat, data = df_chrt))
# b_trt = 1.03 more ER visits in the treatment group compared to .64 fewer in the matched analysis
# use emmeans package to calculate 'marginal means' or adjusted means
library(emmeans)
emmeans (m1, ~ Control_Study_Flag) # 4.81-3.77 = 1.04 which is ~ what we get directly from the regression above
### example negative binomial model
# NOTE(review): glm.nb() is from the MASS package -- confirm library(MASS) is loaded
summary(m1 <- glm.nb(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping + SourceFlag + Gender_Code + LowIncome + RuralZips +Pre_ER_Admit + AcutePre + Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma + Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression +HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag +Total_Bene + Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals + BranchDesc + SponsorRankDesc + AgeCat, control = glm.control(maxit = 500),data = df_chrt))
exp(0.184) # IRR = 1.202 treatment group has ~ 20% more ER visits than controls
m1.mmeans <- emmeans(m1, ~ Control_Study_Flag) # estimate marginal means
summary(m1.mmeans, infer = TRUE, type = 'response') # summary with backtransformation to reflect original scale
(.0634/.0527) # this gives a ratio of marginal means = 1.203 which is consistent with the exponentiated result directly from the model
# *-----------------------------------------------------------------*
# |
# |
# |
# | labels and formats
# |
# |
# |
# *------------------------------------------------------------------*
#-----------------------------------------------------------------#
# value labels or formats
#-----------------------------------------------------------------#
##
## example 1: recoding
##
# change location from 'numeric' into a 'factor' with character values
# this is not actually creating labels, but actually changing the numeric values
# of LOCATION to more meaningful character values such as "N" for north and "S" for south
# NOTE(review): four levels (N/S/E/W) are declared even though the sample data
# shown later only contains LOCATION values 1 and 2 -- E and W remain unused levels
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
plots2
##
## example 2: creating new variables to act as labels for values
##
# lets create new variables to act as labels for insect pressure levels 1-5
# instead of changing the numeric values for P1-P4, we are creating a new
# set of character variables to describe P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
# now create a new set of factor variables that describe P1-P4
plots2$P1f <- factor(plots2$P1, myPlevels, myPlabels)
plots2$P2f <- factor(plots2$P2, myPlevels, myPlabels)
plots2$P3f <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4f <- factor(plots2$P4, myPlevels, myPlabels)
plots2
# Get summary and see that the factor levels are now counted.
summary( plots2[ c("P1f","P2f","P3f","P4f") ] )
##
## note we could have just renamed these values just as we did for
## location in the first example
##
plots2b <- read.table("plots2.txt", header =TRUE) # copy of plots2 for this example
plots2b
# note we are re-using pre-defined labels from above (myPlevels/myPlabels)
# so those statements are not repeated here
plots2b$P1 <- factor(plots2b$P1, myPlevels, myPlabels)
plots2b$P2 <- factor(plots2b$P2, myPlevels, myPlabels)
plots2b$P3 <- factor(plots2b$P3, myPlevels, myPlabels)
plots2b$P4 <- factor(plots2b$P4, myPlevels, myPlabels)
plots2b
#-----------------------------------------------------------#
# variable labels
#-----------------------------------------------------------#
# note in the examples above we were concerned with formatting
# the values a particular variable could take, such as values N, S
# for the variable LOCATION or the value "Very Heavy" for the variables P1-P4
# here we are looking at changing the name or appearance of the variables themselves
# vs the values they may take
library(Hmisc)
# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# assign variable names to act as labels - lets assume we
# want variables p1-p4 to appear as plot1 - plot4
# this example 'pastes' new names over the old ones or
# simply changes the variable names
# note the reference [3:6] simply identifies the 3rd-6th variable names
# in the data frame for processing.
# NOTE(review): names containing spaces (e.g. "PLOT 1") must be wrapped in
# backticks when referenced later, e.g. plots2$`PLOT 1`
names(plots2)[3:6] <- c(
"PLOT 1",
"PLOT 2",
"PLOT 3",
"PLOT 4")
names(plots2) # verify variable names have been changed
#-------------------------------------------------------#
# complete example for variable name and value labels
#-------------------------------------------------------#
# utilize all of the methods above to re-format a data set
# suppose again we start off with a data set such as plots2
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5
# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# suppose we want the values for the variable LOCATION to be N for LOCATION = 1 and
# S for LOCATION = 2. We also want the values for the variables P1-P4 to be
# 1 = "Very Heavy" 2 ="Heavy" 3 = "Moderate" 4 = "Light" 5 ="Very Light"
# and we want the variable names P1-P4 to be "PLOT 1" - "PLOT 4"
# STEP 1: change values for variable LOCATION
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
# STEP 2: change values for the variables P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
plots2$P1 <- factor(plots2$P1, myPlevels, myPlabels) # note we are actually changing the values for P1-P4
plots2$P2 <- factor(plots2$P2, myPlevels, myPlabels) # vs creating variables to describe the values
plots2$P3 <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4 <- factor(plots2$P4, myPlevels, myPlabels)
# STEP 3: change the VARIABLE labels for P1-P4
names(plots2)[3:6] <- c(
"PLOT 1",
"PLOT 2",
"PLOT 3",
"PLOT 4")
# print re-formatted data set
plots2
# functions
# compute the portfolio standard deviation for a two-asset portfolio
#
# NOTE(review): the original version looped nWeights times but never used the
# loop index i -- it recomputed and overwrote the same value on every pass, so
# the loop was dead code.  The single vectorized expression below returns
# exactly the same result (a scalar if Wa/Wb are scalars, or a vector of
# standard deviations if Wa/Wb are weight vectors).
#
# Args:
#   nWeights: retained for backward compatibility with existing callers; it is
#             no longer used because the calculation is fully vectorized.
# Depends on globals (defined elsewhere): Wa, Wb (weights), std.a, std.b
# (standard deviations), p.ab (correlation between the assets)
# Returns: portfolio standard deviation(s) -- note it is sd, not variance,
# despite the "var" in the function name
myvar.port <- function(nWeights) {
  sqrt(Wa^2 * std.a^2 + Wb^2 * std.b^2 + 2 * p.ab * Wa * Wb * std.a * std.b)
}
# example: produce summary statistics for each category in a field
cats <- as.vector(tmp$Var1) # extract measure ids as a vector
# loop over every category; the original hard-coded 1:19, which silently
# skips categories when there are more than 19 and prints NA/NULL results
# when there are fewer
for (i in seq_along(cats)) {
  print(cats[i])
  print(summary(complications$Score[complications$Measure_ID == cats[i]]))
}
#--------------------------------------------------
# dealing with scientific notation
#--------------------------------------------------
# a very small number that R would normally display as 1.82e-07
x <- 1.82 * (10^-7)
# render it as a fixed decimal string rather than scientific notation
y <- format(x, scientific = FALSE)
# or even better: globally discourage scientific notation by raising the
# penalty R applies before switching to it
options(scipen = 999)
#----------------------------------------------------
# PROPENSITY SCORE MATCHING
#---------------------------------------------------
######################################################
# example from MatchIt documentation
#####################################################
# NOTE(review): requires library(MatchIt); the lalonde data frame ships with
# that package (data(lalonde)) -- confirm it is loaded before running
# example data set is a subset of the job training program analyzed in Lalonde (1986) and Dehejia and Wahba (1999).
# MatchIt includes a subsample of the original data con- sisting of the National Supported Work Demonstration (NSW)
# treated group and the comparison sample from the Population Survey of Income Dynamics (PSID).1 The variables in this data
# set include participation in the job training program (treat, which is equal to 1 if participated in the program, and 0 otherwise),
# age (age), years of education (educ), race (black which is equal to 1 if black, and 0 otherwise;
# hispan which is equal to 1 if hispanic, and 0 otherwise), marital status (married, which is equal to 1 if married, 0 otherwise),
# high school degree (nodegree, which is equal to 1 if no degree, 0 otherwise), 1974 real earnings (re74), 1975 real earnings
# (re75), and the main outcome variable, 1978 real earnings (re78)
head(lalonde) # first six rows
dim(lalonde) # rows x columns
names(lalonde)
# "treat" "age" "educ" "black" "hispan" "married" "nodegree" "re74" "re75" "re78"
####################################################
# nearest neighbor matching
####################################################
# Matching is done using a distance measure specified by the distance option (default=logit).
# Matches are chosen for each treated unit one at a time, with the order specified by the m.order command (default=largest to smallest).
# At each matching step we choose the control unit that is not yet matched but is closest to the treated unit on the distance measure.
m.out1 <- matchit(treat ~ re74 + re75 + age + educ, data = lalonde, method = "nearest", distance = "logit")
summary(m.out1) # check balance
m.data1 <- match.data(m.out1,distance ="pscore") # create ps matched data set
head(m.data1) # view
# perform paired t-tests (not in documentation)
# NOTE(review): treating the 1:1 matched data as paired observations assumes
# treated and control rows line up pair-by-pair -- verify before relying on this
t.test(m.data1$re78[m.data1$treat==1],m.data1$re78[m.data1$treat==0],paired=TRUE)
t.test(lalonde$re78[lalonde$treat==1],lalonde$re78[lalonde$treat==0],paired = FALSE) # unmatched comparison for contrast
# export data to compare in SAS or perform additional analysis
# NOTE(review): the "//Documents//Briefcase///Data//" paths look machine-specific -- adjust to your environment
write.csv(lalonde, file = "//Documents//Briefcase///Data//lalonde.csv")
# ex
write.csv(w1, file = "r goals.csv") # assumes assigned directory or default directory
write.csv(m.data1, file = "//Documents//Briefcase///Data//lalonde_nearest.csv")
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)
#---------------------------------
# working with dates
#--------------------------------
# calculate difference in dates
# ex 1: subtracting two Date objects yields a difftime in days
survey$date_diff <- as.Date(as.character(survey$date), format="%Y/%m/%d")-as.Date(as.character(survey$tx_start), format="%Y/%m/%d")
survey
# ex 2: difftime() makes the units explicit
df$diff_in_days<- difftime(df$datevar1 ,df$datevar2 , units = c("days"))
# application: absolute day gap between measure start and end dates
temp3_dates$diff_days <- abs((as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y")-as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y")))
# or equivalently with difftime()
temp3_dates$diff_days <- abs((difftime(as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y"),as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y") , units = c("days"))))
# create date format
df1$CovEffDate <- as.Date(df1$Coverage_Effective_Date, "%Y-%m-%d")
# sometimes you need to supply the origin if your data is numeric
# NOTE(review): as_datetime() is a lubridate function -- library(lubridate) required
as_datetime(y, origin = lubridate::origin)
# I have also seen (numeric day counts are offsets from the 1970-01-01 epoch)
as.Date(17536,origin="1970-01-01")
# get min and max dates
# NOTE(review): %>% and summarize() require library(dplyr)
tmp <- df1%>%
summarize(mindate = min(CovEffDate, na.rm=TRUE),
maxdate = max(CovEffDate, na.rm=TRUE))
## first days of years
seq(as.Date("1910/1/1"), as.Date("1999/1/1"), "years")
## by month
seq(as.Date("2000/1/1"), by = "month", length.out = 12)
## quarters
seq(as.Date("2000/1/1"), as.Date("2003/1/1"), by = "quarter")
## find all 7th of the month between two dates, the last being a 7th.
st <- as.Date("1998-12-17")
en <- as.Date("2000-1-7")
ll <- seq(en, st, by = "-1 month") # step backwards a month at a time from the end date
rev(ll[ll > st & ll < en])
# application: calculate age at index date for an observational study
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- as.Date(df1$BirthDate, "%Y-%m-%d") # format as date
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age in years (365.25 accounts for leap years)
df1$age <- as.numeric(df1$age) # make sure this is numeric
# calculate age using lubridate function
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- mdy(df1$BirthDate) # apply date format (lubridate function)
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age
df1$age <- as.numeric(df1$age) # make numeric
#----------------------------
# sample training and validation
#---------------------------
# store total number of observations in your data
N <- 400
print(N)
# Number of training observations (50/50 split here; a non-integer split
# fraction would need rounding before use as a row index)
Ntrain <- N * 0.5
print(Ntrain)
# add an explicit row number variable for tracking
id <- seq(1,400)
apps2 <- cbind(apps,id) # NOTE(review): assumes `apps` has exactly 400 rows -- confirm
# Randomly arrange the data and divide it into a training
# and test set.
dat <- apps2[sample(1:N),] # shuffle the rows
train <- dat[1:Ntrain,] # first half -> training set
validate <- dat[(Ntrain+1):N,] # second half -> validation set
dim(dat)
dim(train)
dim(validate)
# sort and look at data sets to see that they are different
sort_train <- train[order(train$id),]
print(sort_train)
sort_val <- validate[order(validate$id),]
print(sort_val)
# random sample
# draw n rows (without replacement) from data frame df; uses the current RNG
# state, so repeated calls return different rows
# (fixed: use <- rather than = for assignment; drop the superfluous return())
randomSample <- function(df, n) {
  df[sample(nrow(df), n), ]
}
rs <- randomSample(tmp,5)
# random sample (with set.seed for repeatability)
# same as above, but fixes the RNG seed so every call returns the same rows
# NOTE(review): set.seed() inside a function resets the global RNG stream as a
# side effect -- deliberate here for repeatability, but be aware downstream
# (fixed: use <- rather than = for assignment; drop the superfluous return())
randomSample <- function(df, n) {
  set.seed(123)
  df[sample(nrow(df), n), ]
}
randomSample(tmp,5) # first call
randomSample(tmp,5) # second call - identical rows to the first because the seed is fixed inside the function
#-----------------------------------
# bootstrap
#-----------------------------------
# regression
summary(fit <- lm(GARST ~ PIO, data = yield_data))
# bootstrap the slope coefficient (this may not be accurate with such a small sample)
# preallocate the result vector rather than growing it with c() inside the
# loop -- growing copies the whole vector on every iteration (O(n^2))
n_reps <- 10000
bstrap <- numeric(n_reps)
for (i in seq_len(n_reps)) {
  # resample 5 rows with replacement and refit the regression
  newsample <- yield_data[sample(nrow(yield_data), 5, replace = TRUE), ]
  bstrap[i] <- as.vector(coef(lm(GARST ~ PIO, newsample))[2])
}
hist(bstrap) # bootstrap distribution of the slope
summary(bstrap)
sd(bstrap, na.rm = TRUE) # bootstrap SE of the slope
quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% percentile interval
#------------------------------------
# transpose or reshape data
#------------------------------------
# reference: https://stats.idre.ucla.edu/r/faq/how-can-i-reshape-my-data-in-r/
# create example data
id <- c(1,1,1,2,2,2,3,3,3)
measure <- c("depth","temp","width","depth","temp","width","depth","temp","width")
values <- c(2,50,18,1.5,53,18,2.5,60,18)
dat <- data.frame(id,measure,values)
# transpose panel data to wide format
datwide <- reshape(dat,
                   timevar = "measure", # the panel variable whose levels become new columns
                   idvar = c("id"), # vars we want to keep constant
                   direction = "wide")
# cleanup names: reshape() prefixes the new columns as "values.depth" etc.
# NOTE(review): this rename must happen BEFORE the long reshape below -- in the
# original the rename came after, so `varying = c("depth","temp","width")`
# referenced columns that did not exist yet and reshape() errored
names(datwide) <- gsub("values.", "", names(datwide)) # cleanup names
# collapse wide data back into panel (long) format
datnarrow <- reshape(datwide,
                     varying = c("depth","temp","width"), # things we want to collapse into a single column
                     v.names = "value", # name for new column that will hold panel of values
                     timevar = "measure", # name for new column that collapses old variable names into values
                     times = c("depth","temp","width"), # old variable names that become new values
                     new.row.names = 1:1000,
                     direction = "long")
#----------------------------
# check duplicates
#---------------------------
# check duplication
# table() counts rows per Provider_ID; Freq > 1 in the summary flags duplicate ids
tmp <- data.frame(table(temp1_stars$Provider_ID))
summary(tmp)
# remove duplicates or get distinct
DEK <- c(150,150,145,140,144,148,152,156,160,164)
PLOT <- c(1,1,1,1,5,6,7,8,9,10)
BT <- c('Y','Y','Y','N','N','N','Y','N','Y','Y')
dat <- data.frame(DEK,PLOT,BT)
# how many unique values of BT?
# NOTE: since R 4.0 data.frame() no longer converts strings to factors, so the
# original length(levels(dat$BT)) returns 0 (levels() of a character vector is
# NULL); count the distinct values directly instead
length(unique(dat$BT))
# how many unique plot values
length(unique(dat$PLOT))
# this data is duplicated in many ways
deduped.dat <- unique(dat[,1:3 ] ) # gets unique based on all 3 fields
# dplyr offers several options
# NOTE(review): the %>% pipelines below require library(dplyr)
deduped.dat <- dat%>%distinct(PLOT, .keep_all = TRUE) # based on PLOT
deduped.dat <- dat%>%distinct(PLOT,BT, .keep_all = TRUE) # based on PLOT and BT
deduped.dat <- dat%>%distinct(PLOT,BT,DEK, .keep_all = TRUE) # truly unique based on all listed fields
deduped.dat <- dat%>% distinct # same as above without listing fields related to duplication
# get distinct instance based on ID and pre term flag = 0
# (arrange sorts pre_term ascending so the 0 row comes first; distinct then keeps it)
tmp1 <- tmp1_births%>%arrange(MBR_ID,pre_term)%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID)) # verify: every member should now appear once
tmp4 <- tmp1[tmp1$MBR_ID =='10060',] # spot check a single member
# get distinct last instance based on ID and pre term flag = 1
# (desc() sorts pre_term descending so the 1 row comes first; distinct keeps it)
tmp1 <- tmp1_births%>%arrange(MBR_ID,desc(pre_term))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]
# correcting plot boundaries (shrink margins so diagnostic plots fit the device)
par(mar=c(1,1,1,1))
# example routine checking and correcting duplicates based on key and 1 additional field
# NOTE(review): uses dplyr (%>%, arrange, distinct) and the randomSample helper
# defined earlier in this script
# check duplicates
tmp1 <- data.frame(table(df2$MBR_ID)) # find duplicates
tmp <- df2[df2$MBR_ID =='10060',] # appears duplicated on diagnosis &/or claim #
# get distinct sorting to keep most recent diagnosis (judgement call)
tmp2 <- df2%>%arrange(MBR_ID,desc(SVC_FROM_DT))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$MBR_ID =='10060',] # check
# update data frame
df2 <- tmp2
rm(tmp,tmp1,tmp2,tmp3) # cleanup
# example referencing against a list that is a random sample of numeric IDs
# note: change formatting in 'vals' if character IDs are required
# check duplicates
tmp1 <- data.frame(table(df5$ID)) # find duplicates
tmp <- df5[df5$ID =='80050',] # appears duplicated on flag #
# check against a random sample of duplicates
rs <- randomSample(tmp1[tmp1$Freq > 1,],5) # look at random sample of duplicates
vals <- as.list((as.numeric(as.character(rs$Var1)))) # get ID as list from rs
tmp <- df5[df5$ID %in% vals,] # subset data with duplicated rows - check
# get distinct sorting to give priority to pre-term flag = 1 (judgement call)
tmp2 <- df5%>%arrange(ID,desc(pre_term))%>%distinct(ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$ID %in% vals,] # check again - all pre_term = 1 obs should be kept
# update data frame
df5 <- tmp2
rm(tmp,tmp1,tmp2,tmp3,rs,vals) # cleanup
#-----------------------------
# missing values
#-----------------------------
# count missing values per column in a data frame
colSums(is.na(df1_chrt))
### example recode missing category
# (is.na() already returns a logical; comparing it == TRUE was redundant)
df1$Lvl <- ifelse(is.na(df1$Level),'1-Low-Risk',df1$Lvl)
### example - categorizing with a missing value category
df1$riskcat <- ifelse(df1$Risk_Score <= 3,'1-Low',
                      ifelse(df1$Risk_Score > 3 & df1$Risk_Score <= 5, '2-Moderate', '3-High'))
# account for NA group
df1$riskcat <- ifelse(is.na(df1$Risk_Score),'0-NA',df1$riskcat)
# remove missing based on specified field
# (the original compared is.na() == 'FALSE', i.e. a logical against a string,
# relying on implicit coercion -- negate the logical directly instead)
tmp3 <- tmp3[!is.na(tmp3$RUCC_2013),]
tmp2 <- na.omit(tmp1) # listwise deletion of all missing
# basic imputation
# replace every NA in x with the mean of the non-missing values
impute.mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}
apply(df$xvar,2,impute.mean)
# replace every NA in x with zero (wholesale zero-fill imputation)
impute.zero <- function(x) {
  replace(x, is.na(x), 0)
}
df1 <- data.frame(apply(df,2,impute.zero)) # this imputes all missing column values to zero
#--------------------------
# loop processing
#-------------------------
# loop across fields and produce frequencies
results <- list() # list for storing results
# run loop and simulate data
# (fixed: <- instead of = for assignment; [[ ]] to set a single list element)
for (i in 1:5) {
  results[[i]] <- sample(1:15, 1)
}
x <- as.numeric(as.character(unlist(results))) # flatten list into a numeric vector
summary(x) # analysis
#--------------------------------------
# working with functions
#--------------------------------------
# NOTE(review): apply() on a data frame coerces it to a matrix first -- with any
# non-numeric column sd() will then fail; sapply(df, sd) avoids the coercion
apply(df,2,sd) # calculate standard deviation for every column in data set
# NOTE(review): sapply() here returns a matrix, not a data frame; wrap the
# result in data.frame() if a data frame is needed downstream
df2 <- sapply(df1,as.numeric) # convert all columns to numeric
#-----------------------------------
# substring text and character operations
#-----------------------------------
# if ENDT still NULL and there is a coverage effective date then replace null end date with today's date
# NOTE(review): as.character.Date("2018-10-14") calls the Date method on a plain
# string -- it returns the string unchanged, but as.character() would be clearer
df1$ENDT <- ifelse((substr(df1$ENDT,1,1) =="N" & (substr(df1$DateVar,1,1) != "N")),as.character.Date("2018-10-14"),df1$ENDT)
# remove first character
# (fixed: the original read from df$ID while assigning into df1$ID -- almost
# certainly a typo for df1; confirm df was not intended)
df1$ID <- substring(df1$ID, 2)
# remove last character
df1$ID <- substring(df1$ID,1,nchar(df1$ID)-1) # drop the trailing character
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment