Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active June 17, 2024 12:29
Show Gist options
  • Save BioSciEconomist/be83a5352c91c1e82a5d0846635fa483 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/be83a5352c91c1e82a5d0846635fa483 to your computer and use it in GitHub Desktop.
Data cleaning, processing, and analysis in R
# *-----------------------------------------------------------------
# | PROGRAM NAME: R Code Palette
# | DATE: 3/11/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE:
# *----------------------------------------------------------------
#----------------------------------
# documentation templates
# ---------------------------------
# *-----------------------------------------------------------------
# | PROGRAM NAME:
# | DATE:
# | CREATED BY:
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE:
# *----------------------------------------------------------------
###############################################################################
###############################################################################
################# construction zone #############################
###############################################################################
###############################################################################
##
## new section
##
# *-----------------------------------------------------------------*
# |
# |
# |
# | reading data
# |
# |
# |
# |
# *------------------------------------------------------------------*
# identify location of data used for project by specifying a working directory
# alternatively, if data is stored in numerous places, you can reference
# the file location directly
# --- workspace / session housekeeping snippets ---
# NOTE(review): rm(list=ls()) wipes the entire global environment; intended for
# interactive use at the top of a throwaway session, not for sourced scripts.
rm(list=ls()) # get rid of any existing data
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
cat("\f") # clear console
# helper: "clear console" substitute that just prints 50 blank lines
clc <- function() cat(rep("\n", 50))
rm("temp") # remove specific data frames (errors if 'temp' does not exist)
ls() # view open data sets (should be empty if you just ran the code above)
detach("package:vegan", unload=TRUE) # unload packages that we don't need or that mask other functions
# sometimes we may want to unload different versions of packages - the function below handles this +
# provides a specific example of how MatchIt loads MASS which masks dplyr
# function to detach MASS package which masks the 'select' operation within dplyr used for merging data
# we use dplyr extensively to process data but later in the analysis using the MASS package to estimate
# negative binomial models (for utilization counts like ER visits) it 'masks' or prevents 'select' from working
# this is a problem if we were previously using MASS (or MatchIt which leverages MASS) in an analysis and then wanted to run this program
# so we want to 'unload' or 'detach' any versions of MASS which may be loaded before trying to build the data
# for the analysis below leveraging dplyr functions like 'select.' This is unnecessary if MASS has not been loaded
# see also: https://stackoverflow.com/questions/6979917/how-to-unload-a-package-without-restarting-r
# Detach (and unload) every copy of a package from the search path.
# `pkg` may be given unquoted (default) or as a string with character.only = TRUE.
# Looping matters: multiple versions of the same package can be attached at once.
detach_package <- function(pkg, character.only = FALSE) {
  # Resolve an unquoted package symbol to its name unless told otherwise.
  if (!character.only) {
    pkg <- deparse(substitute(pkg))
  }
  target <- paste0("package:", pkg)
  # Keep detaching until no entry for this package remains on the search path.
  while (target %in% search()) {
    detach(target, unload = TRUE, character.only = TRUE)
  }
  invisible(NULL)
}
# unload order matters: MatchIt imports MASS, so drop MatchIt first
detach_package("MatchIt", TRUE) # first have to unload 'MatchIt' because it imports 'MASS'
detach_package("MASS", TRUE) # unload/detach MASS
# view specific columns (assumes df1 exists with at least 103 columns — verify)
View(df1[,100:103])
str(df1) # variable names and formats
str(df1, list.len=ncol(df1)) # in case R truncates the output above
# set working directory (pick the line matching your OS; paths are machine-specific)
setwd('/Users/wkuuser/Desktop/R Data Sets') # mac
setwd("P:\\R Code References\\R Training") # windows
# *------------------------------------------------------------------*
# | make R talk on a macbook
# *------------------------------------------------------------------*
# shells out to the macOS `say` command; no-op/error on other platforms
b <- sprintf("say Pumpkin pie is the best")
system(b, intern = FALSE, ignore.stdout = FALSE, ignore.stderr =
FALSE, wait = TRUE, input = NULL)
# *--------------------------------------------------------------------------------------------------*
# | direct reading of a single variable vector from command line- similar to cards statement in SAS
# *------------------------------------------------------------------------------------------------*
# Hand-keyed trial data: yields for four hybrids across ten plots,
# plus Y/N trait flags (BT, RR) per plot — the SAS "cards" idea, in R.
GARST <- c(150, 140, 145, 137, 141, 145, 149, 153, 157, 161)
print(GARST)
yield_data <- data.frame(
  GARST = GARST,
  PIO   = c(160, 150, 146, 138, 142, 146, 150, 154, 158, 162),
  MYC   = c(137, 148, 151, 139, 143, 120, 115, 136, 130, 129),
  DEK   = c(150, 149, 145, 140, 144, 148, 152, 156, 160, 164),
  PLOT  = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  BT    = c("Y", "Y", "N", "N", "N", "N", "Y", "N", "Y", "Y"),
  RR    = c("Y", "N", "Y", "N", "N", "N", "N", "Y", "Y", "N")
)
rm(GARST) # drop the free-standing vector; the data frame keeps its own copy
# quick look: GARST vs PIO yields
plot(yield_data$GARST, yield_data$PIO)
# this is even more like the cards statement in SAS
# Inline data entry, closest R analogue to a SAS datalines/cards block:
# the whole table lives in a string literal, header row included.
toread <- "id sex age inc r1 r2 r3
1 F 35 17 7 2 2
17 M 50 14 5 5 3
33 F 45 6 7 2 7
49 M 24 14 7 5 7
65 F 52 9 4 7 7
81 M 44 11 7 7 7
2 F 34 17 6 5 3
18 M 40 14 7 5 2
34 F 47 6 6 5 6
50 M 35 17 5 7 5"
# read.table's text= argument parses the literal directly
survey <- read.table(text = toread, header = TRUE)
closeAllConnections()
#--------------------------------------------------
# reading a file from the web
#---------------------------------------------------
# fread (data.table) reads a CSV straight from a URL; requires network access
library(data.table)
mydat <- fread('https://raw.githubusercontent.com/dincerti/dincerti.github.io/master/data/mepsdiab.csv')
head(mydat)
# *------------------------------------------------------------------*
# | reading data from a txt file
# *------------------------------------------------------------------*
# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N
# whitespace-delimited text file with a header row (see layout sketched above)
yield_data<-read.table("Yield_plots.txt", header=TRUE) # read text file
names(yield_data) # var list
print(yield_data) # print data set
yield_data # note: print() is not necessary to print
# get all factor variables
# NOTE(review): since R 4.0 read.table no longer auto-converts strings to
# factors, so is.fact may be all FALSE here — confirm against your R version
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[,is.fact]
# *------------------------------------------------------------------*
# | reading data from a csv file
# *------------------------------------------------------------------*
# data looks like this
# HYBRID TRAIT P1 P2 P3
# GARST RR 150 140 145
# MYC BT 160 150 146
# PIO RR 137 148 151
# DEK BT 150 149 145
# na.strings: treat ".", empty string, and "?" as missing on read
plots <- read.csv("plots.csv", na.strings=c(".", "NA", "", "?"), encoding="UTF-8")
plots
# create function to reformat column names from csv files (from:https://stackoverflow.com/questions/17152483/how-to-replace-the-in-column-names-generated-by-read-csv-with-a-single-spa )
# Clean up column names mangled by read.csv (which turns spaces/punctuation
# into dots): any run of one-or-more dots becomes a single underscore, then
# trailing blanks (if any) are stripped. Returns the data frame, renamed.
makeColNamesUserFriendly <- function(ds) {
  nms <- names(ds)
  # collapse consecutive dots to "_"
  nms <- gsub("(\\.)+", "_", nms)
  # drop trailing spaces
  nms <- gsub("( )+$", "", nms)
  names(ds) <- nms
  ds
}
# create unique ID for each line in data frame (simple 1..n row counter)
tmp_claims$RowID <- 1:nrow(tmp_claims)
# convert index (row names) to an ordinary column
names <- rownames(yield_data)
yield_data <- cbind(names,yield_data)
# repair read error for first ID (sometimes a csv read causes the first element in the file to misread)
df$ID <- as.character(df$ID)
df[1,1] <- '123456'
# *------------------------------------------------------------------*
# | reading data from a excel xlsx
# *------------------------------------------------------------------*
xlsxFile <- "C:\\Users\\mtb2901\\Documents\\Data Science Projects\\Projects 2018\\201801 Hospital Risk Profile\\Raw Data\\TX-Hospital-Tricare-Network.xlsx"
# reference: https://www.rdocumentation.org/packages/openxlsx/versions/4.0.17/topics/read.xlsx
# read.xlsx(xlsxFile, sheet = 1, startRow = 1, colNames = TRUE,
# rowNames = FALSE, detectDates = FALSE, skipEmptyRows = TRUE,
# skipEmptyCols = TRUE, rows = NULL, cols = NULL, check.names = FALSE,
# namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE)
# NOTE(review): `sheetIndex` is the xlsx-package argument; openxlsx (cited
# above) uses `sheet` — confirm which package is actually loaded here
vars <- read.xlsx("FraudT3CB14jun18HEADER.xlsx",sheetIndex = 1) # read header file with variable names
#-------------------------------------
# writing files
#--------------------------------------
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)
# export as R data file
save(MBR_FACT_ANLY, file = "//projects//Data//MBR_FACT_ANLY.RData")
# export as text with tab delimiter
# (fixed: the file path was missing its opening quote, a syntax error)
write.table(df1, "//Projects//Data//df1.txt", row.names = FALSE, sep = "\t")
# *-----------------------------------------------------------------*
# |
# |
# |
# | printing and subsetting data
# |
# |
# |
# |
# *------------------------------------------------------------------*
# print selected variables (single-bracket select keeps a data frame)
yield_data[c("GARST")]
# subset data via variable selection
my_hybrids <- yield_data[ c("GARST", "PIO")]
print(my_hybrids)
# subset based on a single variable
# NOTE: this creates GARST as a one-column DATA FRAME in the global env
GARST<-yield_data[c("GARST")]
print(GARST)
df <- df[ , !names(df) %in% c("xvar")] # drop field
# print a subset of data based on observed variable values
print(yield_data[yield_data$GARST==150 & yield_data$PIO==160,])
# subset data based on observed values
high_yields <- yield_data [ yield_data$GARST==150 & yield_data$PIO==160,]
print(high_yields)
stacked_traits <-yield_data[ yield_data$BT =="Y" & yield_data$RR =="Y",]
stacked_traits
# subset based on just one variable value
# NOTE(review): these two lines rely on the one-column data frame GARST
# created above (row-wise comparison on a data frame); the more conventional
# form would be yield_data$GARST == 140 — verify intent
LOW_GARST<-yield_data[GARST==140,]
print(LOW_GARST)
LOW_GARST<-yield_data[GARST < 150,]
print(LOW_GARST)
RR<- yield_data[yield_data$RR =="Y",]
RR
Bt<- yield_data[yield_data$BT=="Y",]
Bt
# subset based on %in% (membership test; never returns NA)
vc <- c(150,145,195)
tmp <- yield_data[yield_data$GARST %in% vc,]
# or
tmp <- yield_data[yield_data$GARST %in% c(150,145,195,161),]
# this might also work using dplyr (which allows for additional manipulation if you want to
# 'pipe' in additional aggregations or calculations)
library(dplyr)
tmp <- yield_data %>%
filter(GARST %in% c(150,145,195))
# subset based on 'not' %in% using dplyr
tmp <- filter(yield_data,!BT %in% c('Y'))
library(sqldf)
tmp <- sqldf('select * from yield_data where GARST in(150,145,195)')
#------------------------------------------------
# connect to SQL server
#------------------------------------------------
# read initial analysis file
library(RODBC)
# connect to the SQL Server (trusted/Windows authentication; server name is site-specific)
DB_Connection <- odbcDriverConnect('driver={SQL Server};server=arlsql123;database=ANLY_SBOX;trusted_connection=true')
# connect to specific table (see documentation to limit rows etc.)
MBR_FACT_ANLY <- sqlFetch(DB_Connection, "mbr_fact_anly")
# export as R data file for future use (so we don't have to hit SQL every time)
save(MBR_FACT_ANLY, file = "//projects//mtt94674//Data//MBR_FACT_ANLY.RData")
# make sure to close your database connection
close(DB_Connection)
# *-----------------------------------------------------------------*
# |
# |
# |
# | summary statistics
# |
# |
# |
# |
# *------------------------------------------------------------------*
summary(yield_data) # whole data set
summary(yield_data[c("GARST")]) # single variable
summary(yield_data$GARST) # single variable (more efficient)
summary(yield_data[c("GARST" , "PIO")]) # selected variables
summary(sqrt(yield_data$GARST) ) # transformation
# by processing: one summary per BT level
by(yield_data, yield_data$BT, summary)
# using dplyr
yield_data%>%
group_by(BT) %>% # group by s
summarize(mean = mean(GARST)) # mean
### more examples with dplyr
# how many members per risk group (members are duplicated by transactions)
# as such the same member can have more than one transaction scored by the model
# that are in the same risk group and a member can belong to multiple
# risk groups if they have multiple transactions scored accordingly
### brute force example (one filter per risk category; only the last tmp survives)
tmp <- df4%>%filter(riskcat == '1:LOW')%>%distinct(mbrid)
tmp <- df4%>%filter(riskcat == '2:MOD')%>%distinct(mbrid)
tmp <- df4%>%filter(riskcat == '3:HI')%>%distinct(mbrid)
tmp <- df4%>%filter(riskcat == '4:VHI')%>%distinct(mbrid)
### a little less brute force
tmp1 <- df4%>%select(riskcat,mbrid)%>%arrange(riskcat,mbrid) # sort by risk and ID
tmp2 <-tmp1%>%group_by(riskcat)%>%distinct(mbrid) # get unique IDs per risk strata
# how many members per strata
tmp2%>%
group_by(riskcat)%>%
summarize(total = n())
### can we do the things above in one step
df4%>%select(riskcat,mbrid)%>%arrange(riskcat,mbrid)%>%group_by(riskcat)%>%distinct(mbrid)%>%
summarize(total = n())
library(Hmisc)
# frequencies (NOTE: Hmisc is loaded twice here; the second call is redundant)
library(Hmisc)
describe(yield_data)
# contents of data set - (Hmisc)
contents(yield_data)
# aggregations and crosstabulations (Hmisc formula-style summary)
summary(ACTIV_PARTICIP ~ AGECAT, data=hc_mbr_anly, fun=mean)
# tables
tmp <- data.frame(table(complications$Measure_ID)) # saved as an easily readable data frame
aggregate(ACTIV_PARTICIP~RUCC_2013_DESC,hc_mbr_anly,mean)
# toy crop/trait data for the crosstab examples
# NOTE(review): this OVERWRITES yield_data from earlier sections — it now has
# only CROP and RR columns, so later references to yield_data$GARST will fail
CROP <- c('Cotton','Cotton','Corn','Corn','Corn','SB','SB','SB','SB','SB')
RR <- c('Y','N','Y','N','N','N','N','Y','Y','N')
yield_data <- data.frame(CROP,RR)
rm(CROP,RR) # cleanup
mytable <-table(yield_data$RR,yield_data$CROP)
tmp <- data.frame(mytable)
names(tmp) <- c("RR","Crop","Total")
print(tmp)
tmp <- data.frame(prop.table(mytable)) # cell percentages
names(tmp) <- c("RR","Crop","Pct")
print(tmp)
tmp <- data.frame(prop.table(mytable, 1)) # row percentages (%Crop within RR status)
names(tmp) <- c("RR","Crop","RowPct")
print(tmp)
tmp <- data.frame(prop.table(mytable, 2)) # column percentages (%RR within Crop)
names(tmp) <- c("RR","Crop","ColPct")
print(tmp)
# cross tabulations across a range of variables
# average garst across BT and RR
# NOTE(review): in R >= 4.0 strings are not factors by default, so is.fact may
# select nothing; and GARST no longer exists in this version of yield_data
is.fact <- sapply(yield_data, is.factor)
tmp <- yield_data[,is.fact]
vars <- names(tmp)
for(i in 1:2) {
print("+-+-+-+-+-+-+-+-+-+-+-+-")
print(vars[i])
print("+-+-+-+-+-+-+-+-+-+-+-+-")
print(by(yield_data$GARST, yield_data[c(vars[i])],mean))
}
# cross tabulations across a range of columns in a data frame
names(sb) # list columns for reference
xvar <- as.list(names(sb)) # save names as a list to reference in loop
# loop across fields and produce frequencies
# NOTE(review): 1:36 hard-codes the column count; seq_along(xvar) would adapt
for(i in 1:36) {
print(xvar[i])
print(data.frame(table(sb[c(paste(xvar[i]))])))
}
aggregate(totmthcov~pre_term,df1,mean)
# using dplyr: one-row distribution summary of the prob column
df1%>%
summarize(mean = mean(prob,na.rm = TRUE),
min = min(prob,na.rm = TRUE),
Q1=quantile (prob, probs=0.25,na.rm = TRUE),
med = median(prob,na.rm = TRUE),
Q3=quantile(prob, probs=0.75,na.rm = TRUE),
P85=quantile(prob, probs=0.85,na.rm = TRUE),
P90=quantile(prob, probs=0.90,na.rm = TRUE),
P95=quantile(prob, probs=0.95,na.rm = TRUE),
P99=quantile(prob, probs=0.99,na.rm = TRUE))
# aggregate totals per person
df5 <- df4%>%
group_by(ID) %>%
summarize(TOTAL_ALLOW_PRE = sum(TOT_ALLOWED_AMT))
xtabs(~ BT, data = yield_data) # counts of types of BT
xtabs(~ BT + RR, data = yield_data) # BT by RR
xtabs(GARST ~ RR, data=yield_data) # total RR for GARST
# categorical frequencies by treatment status
mytable <- table(df1$PrimaryDiagDesc,df1$treat)
prop.table(mytable, 2) # column percentage
# *-----------------------------------------------------------------*
# |
# |
# |
# | creating and adding new variables
# |
# |
# |
# |
# *------------------------------------------------------------------*
# create a new data set that includes a new variable that is
# the sum of "garst" and "pio"
yield_data2<-transform(yield_data, sum=GARST+PIO)
print (yield_data2)
# add a new variable to existing data set that is
# the sum of all hybrids
#first create data set HYBRIDS
hybrids<-yield_data
print(hybrids)
# add new variable to hybrids
hybrids<-transform(hybrids, SUM=GARST+PIO+MYC+DEK)
print(hybrids)
# or add 2 new variables
# NOTE(review): within a single transform() call, MEAN=SUM/4 cannot see the
# SUM defined in the same call — it works here only because SUM already
# exists on hybrids from the previous line; verify before reusing standalone
hybrids<-transform(hybrids, SUM=GARST+PIO+MYC+DEK,
MEAN=SUM/4)
print(hybrids)
# combine or concatenate two columns into a new string column
yield_data$x <- paste(yield_data$RR,yield_data$BT)
yield_data$x <- paste(yield_data$RR, "-",yield_data$BT)
# (fixed: trailing prose was left uncommented, making this line a syntax error)
df$x <- paste(df$n, "-", df$s) # for inserting a separator
# *-----------------------------------------------------------------*
# |
# |
# |
# | conditional processing
# |
# |
# |
# |
# *------------------------------------------------------------------*
# recall from the data set there are 10 plots
# first assume that in plot 3 we accidentally planted a Bt
# garst hybrid that was supposed to be a non-Bt hybrid
# so we don't want to include it in the non Bt
# average for hybrids- we can use 'ifelse' conditions
# to exclude this from the mean calculation
# first create data set trials
trials<-yield_data
print(trials)
print(trials$PLOT)
# NOTE(review): the inner <- assignments below are redundant side effects;
# the outer ifelse() result overwrites trials$AVG anyway
trials$AVG <- ifelse ( trials$PLOT == 3 ,
(trials$AVG <- (trials$PIO+trials$MYC+trials$DEK)/3), # excludes GARST in the calculation if PLOT = 3
(trials$AVG <- (trials$GARST+trials$PIO+trials$MYC+trials$DEK)/4 ) ) # for all other plots
# creates new variable AVG in data set trials
trials
# another example - notice the nesting of the ifelse conditions:
grades$letter <- ifelse(grades$score >= .90,"A", ifelse(grades$score >= .80 & grades$score < .90, "B",ifelse (grades$score >= .70 & grades$score < .80,"C",ifelse(grades$score >= .60 &
grades$score < .70, "D","F"))))
table(grades$letter)
# create binary target (1 = anything other than "no")
banking$target <- ifelse(banking$y =="no",0,1)
# model calibration / risk stratification (cut points: .12, .30, .60)
df1$risk <- ifelse(df1$prob < .12,"1:LOW",
ifelse(df1$prob >= .12 & df1$prob < .30, "2:MOD",
ifelse(df1$prob >= .3 & df1$prob < .60, "3:HI","4:VHI")))
table(df1$risk)
# age categorization (nested ifelse; cut() would be the compact alternative)
df1$AgeCat <- ifelse(df1$age < 18,'Under 18',
ifelse(df1$age >= 18 & df1$age <= 25,'18-25',
ifelse(df1$age > 25 & df1$age <= 29, '26-29',
ifelse(df1$age > 29 & df1$age <= 39,'30-39',
ifelse(df1$age > 39 & df1$age <= 49, '40-49',
ifelse(df1$age > 49 & df1$age <= 59, '50-59',
ifelse(df1$age >59 & df1$age <= 64, '60-64','65-65+')))))))
# *-----------------------------------------------------------------*
# |
# |
# |
# | stacking / concatenating/ adding data sets
# |
# |
# |
# |
# *------------------------------------------------------------------*
# get only bt hybrids- same as subsetting
BT_HYBRIDS <- yield_data[yield_data$BT=="Y", ]
print(BT_HYBRIDS)
# get only non bt hybrids
NON_BT_HYBRIDS<- yield_data[yield_data$BT=="N",]
print(NON_BT_HYBRIDS)
# *------------------------------------------------------------------*
# | combine with the rbind function (columns must match)
# *------------------------------------------------------------------*
both<-rbind(BT_HYBRIDS,NON_BT_HYBRIDS)
print(both)
# *------------------------------------------------------------------*
# | stack with SQL using sqldf
# *------------------------------------------------------------------*
library(sqldf)
library(ggplot2)
both_sql = sqldf('
select *
from BT_HYBRIDS union all select * from NON_BT_HYBRIDS
order by PLOT
')
# note: the 'all' statement keeps all rows, if you omit 'all' you get only unique rows
both_sql
# *-----------------------------------------------------------------*
# |
# |
# |
# |
# | merging based on common variables
# |
# |
# |
# *------------------------------------------------------------------*
# first split the data, one set contains plot, bt, rr, garst, pio
# the other set has plot, rr, bt, myc, dek
# we'll work under the assumption that the first set is 'early'
# hybrids and the second set is 'late' hybrids
# as if we collected data on these separately and want to combine the
# data set to get one set, like we started with with yield_data
# create early / late planting data set
early<-yield_data[c("PLOT","GARST","PIO","BT","RR")]
early
late<-yield_data[c("PLOT","MYC","DEK","BT","RR")]
late
# *------------------------------------------------------------------*
# | merge by common variables- PLOT, BT, RR
# | (merging on ALL shared columns avoids .x/.y duplicate suffixes)
# *------------------------------------------------------------------*
hybrids<-merge(early,late,by=c("PLOT","BT","RR"))
hybrids
# *------------------------------------------------------------------*
# | merging by a single common variable
# *------------------------------------------------------------------*
# the above merges back to the original data set, but
# we had to be careful to merge by ALL of the common variables
# suppose we had the following data set strip_trial.txt
# HYBRID YIELD TRAIT
# GARST 150 BT
# MYC 140 RR
# PIO 160 RR
# DEK 145 BT
# lets read it in and split it up to form 2 data sets 'yields'
# and 'traits' with a SINGLE common variable 'HYBRID' to merge by
#'yields'
# HYBRID YIELD
# GARST 150
# MYC 140
# PIO 160
# DEK 145
# 'traits'
# HYBRID TRAIT
# GARST BT
# MYC RR
# PIO RR
# DEK BT
# read data 'yields'
yields <- read.table("yields.txt", header=TRUE) # read text file
yields
# read data 'traits'
traits <- read.table("traits.txt", header=TRUE) # read text file
traits
# now to demonstrate merging data sets by a common variable
field_data<-merge(yields,traits,by=c("HYBRID"))
field_data
# *------------------------------------------------------------------*
# | merging with SQL using sqldf
# *------------------------------------------------------------------*
# library(sqldf)
# library(ggplot2)
# left join yields and traits on HYBRID
field_data_sql = sqldf('
select a.HYBRID,a.YIELD,b.TRAIT
from yields a left join traits b
on a.HYBRID =b.HYBRID
order by a.YIELD')
field_data_sql
# left join early and late (from above) on PLOT- note for each plot RR and BT are the
# same regardless of the hybrid so specifying BT and RR for only one data set will suffice,
# but we need to join on PLOT this seems to give a much more straightforward solution to the
# merge above using the R merge statement
hybrids_sql = sqldf('
select a.PLOT,a.GARST,a.PIO,
b.MYC, b.DEK, a.BT, a.RR
from early a left join late b
on a.PLOT =b.PLOT
order by a.PLOT')
hybrids_sql
#-----------------------------------------------------
# merging using dplyr
#----------------------------------------------------
tmp3 <- tmp2%>% left_join(tmp1, by = "State")
# get all months between EFFMTH and ENDMTH in cohort history file
monthlist <- df1%>%left_join(select(tmp5,MBRID,Month,covmth),
by = "MBRID")%>%filter(Month >= EFFMTH & Month <= ENDMTH)
monthlist <- monthlist %>%arrange(MBRID,Month)
# more complicated join with dplyr
# merge claims data with cohort file keeping only claims between the coverage periods
# (fixed: the join key was "DMBRID", a column that exists on neither side —
# both frames carry the key as MBRID after the select() rename of PID)
df3 <- df2%>%select(MBRID,EFFMTH,ENDMTH,INDEX_MTH_DT,pre12,post12,claimflag)%>%left_join(select(raw_claims,MBRID = PID,Claim_Number,SVC_FROM_DT,TOT_ALLOWED_AMT),
by = "MBRID")%>%filter(SVC_FROM_DT >= EFFMTH & SVC_FROM_DT <= ENDMTH)
# *-----------------------------------------------------------------*
# |
# |
# |
# | sorting data
# |
# |
# |
# *------------------------------------------------------------------*
# use file plots2 weed pressure
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# sort data by location (order() returns a permutation of row indices)
plotsSorted<-plots2[order(plots2$LOCATION),]
plots2
plotsSorted
# sort by trait then location
plots2Sorted<-plots2[order(plots2$TRAITS, plots2$LOCATION),]
plots2Sorted
# for descending order, prefix any variable with a minus sign
# (the minus trick only works for numeric columns, as LOCATION is here)
plots2Sorted<-plots2[order(-plots2$LOCATION,plots2$TRAITS),]
plots2Sorted
# dplyr
tmp <- arrange(tmp,desc(Freq))
# *-----------------------------------------------------------------*
# |
# |
# |
# | graphics
# |
# |
# |
# *------------------------------------------------------------------*
# *------------------------------------------------------------------*
# | examples of r graphics capabilities (interactive demos)
# *------------------------------------------------------------------*
demo(graphics)
demo(persp)
library(lattice)
demo(lattice)
# *------------------------------------------------------------------*
# | basic plots
# *------------------------------------------------------------------*
# histogram
hist(yield_data$GARST)
# bar plot
# NOTE(review): rownames() of a plain vector is NULL, so this legend
# argument likely produces no legend — verify
barplot(yield_data$GARST, main="Garst Yields", xlab ="Plots",col =c("darkblue","red"), legend =rownames(yield_data$PLOT))
# bar plot (example) — keep only levels with more than 8 occurrences
# (assumes mytable is the data.frame form, e.g. data.frame(table(...)),
# which has $Freq/$Var1 columns — TODO confirm against the caller)
counts <- mytable$Freq[mytable$Freq > 8]
# (fixed: `False` is not an R constant — the logical literal is FALSE)
barplot(counts, main="Conditions of Members",
xlab="Conditions", col=c("darkblue","red"),
legend = mytable$Var1[mytable$Freq > 8], beside=FALSE)
# using xtabs to create plots
plot(~xtabs(GARST ~ RR, data=yield_data)) # barplot
plot(xtabs(~ BT, data = yield_data)) # barplot
plot(xtabs(~ BT + RR, data = yield_data)) # mosaic plot
# scatter plot
plot(yield_data$GARST,yield_data$PIO)
# scatterplot matrix of whole data set
plot(yield_data)
# scatterplot matrix- data used in regression in analysis section
land_prices<- read.table("land_prices.txt", header =TRUE)
land_prices
plot(land_prices)
# brushed scatterplot - as in model visualization (points colored/shaped by risk strata)
ggplot(df1,aes(x=SUMRX_NONLTOT_QB_GE120MEDD,y=SUMRX_LTOT_QB_GE120MEDD, shape = risk,color = risk)) +
scale_color_manual(values=c("green","yellow", "red","red")) + geom_point()
# labels + jittering (seed fixed so the jitter is reproducible)
set.seed(456)
ggplot(df1,aes(x=SUMRX_NONLTOT_QB_GE120MEDD,y=SUMRX_LTOT_QB_GE120MEDD, shape = risk,color = risk)) +
scale_color_manual(values=c("green","yellow", "red","red")) + geom_point() + geom_jitter(width = 2, height =2) +
xlab("Total Non-Qualified >= 120") +
ylab("Total Qualified >= 120") +
ggtitle("Observed Rx Counts Overlaid by Predicted Risk/Stratification")
# *------------------------------------------------------------------*
# | stacked bar chart
# *------------------------------------------------------------------*
# get data
bytrait<- read.table("biotech_yield.txt", header =TRUE)
bytrait
names(bytrait)
# summarize data for stacking by trait
counts <- table(bytrait$RR, bytrait$YIELD) # list stacked variable first
counts
barplot(counts, main ="Yield by Trait for Garst",xlab ="Yield Category", col =c("darkblue","red"),legend =rownames(counts))
#--------------------------------------------
# pie and bar charts in ggplot
#--------------------------------------------
table(df$pre_term) # get values by group
# hard-coded counts taken from the table() output above
tmp <- data.frame(
group = c("Full-Term", "Pre-Term"),
value = c(7999,6000)
)
# stacked bar
bp <- ggplot(tmp, aes(x="", y=value, fill=group))+
geom_bar(width = 1, stat = "identity")
bp
# pie: a stacked bar in polar coordinates
# NOTE(review): assigning to `pie` shadows base R's pie() function for the
# rest of the session (the base pie chart further below still resolves,
# but the shadowing is worth knowing about)
pie <- bp + coord_polar("y", start=0)
pie
pie + scale_fill_manual(values=c("red","blue"))
# *------------------------------------------------------------------*
# | basic density plots
# *------------------------------------------------------------------*
# plot and save histogram data (hist() returns its bin counts invisibly)
garsths <- hist(yield_data$GARST, main = "Garst Yield Histogram")
print(garsths)
# compute density estimates using default gaussian & rule of thumb bandwidth
garstdens <- density(x=yield_data$GARST, bw="nrd0",kernel="gaussian",n=20)
print(garstdens)
# rescale density so it will plot on the same graph as GARST histogram
rs <- max(garsths$counts/max(garstdens$y)) # create scaling factor for plotting the density
lines(garstdens$x, garstdens$y*rs, col=2) # plot density over histogram using lines statement
# compare this to just plotting the non-rescaled density over the histogram
lines(density(x=yield_data$GARST, bw="nrd0", n=20),col=3) # green line barely shows up
# turn off/close graphics window
dev.off()
# *------------------------------------------------------------------*
# | overlapping density plots
# *------------------------------------------------------------------*
library(colorspace) # package for rainbow_hcl function
# Generate just the data for a histogram of GARST grouping by RR trait
# (stack the full sample plus each RR subgroup into one long frame)
ds <- rbind(data.frame(dat=yield_data[,][,"GARST"], grp="All"),
data.frame(dat=yield_data[,][yield_data$RR=="N","GARST"], grp="N"),
data.frame(dat=yield_data[,][yield_data$RR=="Y","GARST"], grp="Y"))
# histogram for all GARST data (ylim hard-coded for this particular sample)
hs <- hist(ds[ds$grp=="All",1], main="", xlab="GARST", col="grey90", ylim=c(0, 9.46395633979357), breaks="fd", border=TRUE)
# compute density, rescale, and plot for all GARST data
dens <- density(ds[ds$grp=="All",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[1])
# compute density, rescale, and plot where RR = 'N'
dens <- density(ds[ds$grp=="N",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[2])
# compute density, rescale, and plot where RR = 'Y'
dens <- density(ds[ds$grp=="Y",1], na.rm=TRUE)
rs <- max(hs$counts)/max(dens$y)
lines(dens$x, dens$y*rs, type="l", col=rainbow_hcl(3)[3])
# Add a rug to illustrate density
rug(ds[ds$grp=="N", 1], col=rainbow_hcl(3)[2])
rug(ds[ds$grp=="Y", 1], col=rainbow_hcl(3)[3])
# Add a legend to the plot.
legend("topright", c("All", "N", "Y"), bty="n", fill=rainbow_hcl(3))
# Add a title to the plot (title string intentionally spans two lines).
title(main="Distribution of GARST
by RR", sub=paste("Created Using R Statistical Package", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
### pie chart (base graphics)
table(tmp$Enrollee) # get counts
# slice values hard-coded from the table() output above
slices <- c(3111,14141) # define slices
lbls <- c("True","False") # slice labels
pct <- round(slices/sum(slices)*100,digits =2) # calc pct
lbls <- paste(lbls, pct) # add calculated percents to labels
lbls <- paste(lbls,"%",sep="") # add % to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),
main="Members")
### aggregate time series and produce multiple line chart (dplyr & ggplot2)
# subset only top 5 departments for OT Hrs
tmp2<-
filter(df2,Role=="Specialist"|Role=="Finder"|Role=="Customer Care Specialist"|Role =="Clinical"|Role=="Content Specialist")
# aggregate by department and date
tmp3 <- tmp2%>%
group_by(Role,End_Date) %>%
summarize(TotalOTHrs = sum(Overtime_Hours))
# one line per Role over time
ggplot(tmp3,aes(End_Date,TotalOTHrs,colour=Role)) + geom_line() + geom_point()
### aggregate and simple plot
# aggregate & visualize total OT by pay period
tmp <- df2%>%
group_by(End_Date) %>%
summarize(TotalOTHrs = sum(Overtime_Hours))
plot(tmp$End_Date,tmp$TotalOTHrs,type="b", xlab = "End Date", ylab = "Total OT Hrs")
#------------------------------------------------
# side by side plots
#------------------------------------------------
par(mfrow=c(1,2)) # set the plotting area into a 1*2 array
hist(treat,breaks = 50)
hist(ctrl, breaks = 50)
# *-----------------------------------------------------------------*
# |
# |
# |
# | analysis
# |
# |
# |
# *------------------------------------------------------------------*
# *------------------------------------------------------------------*
# | t-tests
# *------------------------------------------------------------------*
# independent samples t-test
t.test(yield_data$GARST,yield_data$PIO ) # difference in GARST vs PIO yields from yield_data file
t.test(plots2$P4[plots2$TRAITS=='RR'],plots2$P4[plots2$TRAITS=='BT'] ) # difference in weed pressure for bt and rr traits in plot 4
# paired t-test
t.test(plots2$P1,plots2$P2,paired=TRUE) # differences in overall weed pressure for plots 1 and 2 (more appropriately used on same subjects- before and after tests)
# example: if plot 1 was before herbicide application while plot 2 was the same plot measured for weed
# pressure after application
# (fixed: the line above was a wrapped comment missing its leading '#', a syntax error)
t.test(yield_data$GARST,yield_data$PIO,paired=TRUE) # not actually appropriate but notice the difference in the results from the previous t-test on
# GARST and PIO
# *------------------------------------------------------------------*
# | measures of association
# *------------------------------------------------------------------*
# pearson correlations (Hmisc::rcorr) - also returns n and p-value matrices
rcorr(cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4)) # gives pearson correlations
# the cor.test has p-value, ci, but only 2 vars at a time
# NOTE(review): cor.test has no `use` argument — it is silently swallowed
# by ...; NA handling here is cor.test's default, verify that is intended
cor.test(plots2$P1,plots2$P2,use="pairwise")
# spearman correlations using the hmisc rcorr function
rcorr( cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4), type='spearman')
rcorr( cbind(plots2$P1,plots2$P2,plots2$P3,plots2$P4), type='pearson') # compare, also same as default
# *------------------------------------------------------------------*
# | simple linear regression
# *------------------------------------------------------------------*
# lets use another data set land_prices
# PRICE ACRES IMPROVEMENTS
# 36 9 8
# 80 15 7
# 44 10 9
# 55 11 10
# 35 10 6
# read the whitespace-delimited file from the current working directory
land_prices<- read.table("land_prices.txt", header =TRUE)
land_prices # print the data to verify it loaded as expected
plot(land_prices) # scatterplot matrix of dependent and independent variables
myRegModel<-lm(PRICE~ACRES,data=land_prices) # run regression of PRICE on ACRES
summary(myRegModel) # view estimates
anova(myRegModel) # analysis-of-variance table for the fitted model
# simple plots - plot data and regression line
plot(land_prices$ACRES,land_prices$PRICE)
abline(myRegModel) # overlay the fitted regression line on the scatterplot
# automated plots (diagnostics: residuals, QQ, scale-location, leverage)
plot(myRegModel)
termplot(myRegModel) # partial regression (term) plot
# *------------------------------------------------------------------*
# | multiple linear regression
# *------------------------------------------------------------------*
# same response, now with two predictors; NOTE this overwrites the
# myRegModel object from the simple-regression example above
myRegModel<-lm(PRICE~ACRES+IMPROVEMENTS,data=land_prices) # run regression
summary(myRegModel) # view estimates
anova(myRegModel) # sequential (Type I) sums of squares for each term
plot(myRegModel) # regression diagnostics
termplot(myRegModel) # partial effect of each predictor
# *------------------------------------------------------------------*
# | analysis of variance
# *------------------------------------------------------------------*
# look at data set 'yield_data'
yield_data<-read.table("Yield_plots.txt", header=TRUE) # read text file
print(yield_data) # print data set
# data looks like this:
# GARST PIO MYC DEK PLOT BT RR
# 150 160 137 150 1 Y Y
# 140 150 148 149 2 Y N
# 145 146 151 145 3 N Y
# 137 138 139 140 4 N N
# 141 142 143 144 5 N N
# 145 146 120 148 6 N N
# 149 150 115 152 7 Y N
# 153 154 136 156 8 N ?
# 157 158 130 160 9 Y Y
# 161 162 129 164 10 Y N
# we would like to set up a design such as this:
# let treatments = hybrid, blocks = plot
#------------------------------------------------#
# HYBRID
# GARST PIO MYC DEK
# PLOT 1 150 137 160 150
# PLOT 2 140 148 150 149
# PLOT 3 145 151 146 145
#------------------------------------------------#
# actual data set will need to look like this 'yield_trials'
# (long/stacked format: one row per hybrid x plot observation)
# HYBRID PLOT YIELD
# GARST P1 150
# GARST P2 140
# GARST P3 145
# PIO P1 137
# PIO P2 148
# PIO P3 151
# MYC P1 160
# MYC P2 150
# MYC P3 146
# DEK P1 150
# DEK P2 149
# DEK P3 145
yield_trials<-read.table("yield_trials.txt", header=TRUE) # read text file (long format shown above)
print(yield_trials) # print data set
# *------------------------------------------------------------------*
# | 2 WAY AOV
# *------------------------------------------------------------------*
# two-way ANOVA with no interaction term: YIELD modeled on HYBRID
# (treatment) and PLOT (block), i.e. a randomized complete block design
myModel<-aov(YIELD~HYBRID+PLOT,data=yield_trials)
summary(myModel)
anova(myModel) # should give same results as above
plot(myModel) # diagnostic plots for the fitted model
termplot(myModel)
#--------------------------------------------------
# getting marginal/adjusted/least squares means from multivariable models
#--------------------------------------------------
### example linear regression
# NOTE(review): df_chrt is built elsewhere -- this snippet will not run standalone
summary(m1 <- lm(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping + SourceFlag + Gender_Code + LowIncome + RuralZips + Pre_ER_Admit + AcutePre + Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma + Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression + HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag + Total_Bene + Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals +BranchDesc + SponsorRankDesc + AgeCat, data = df_chrt))
# b_trt = 1.03 more ER visits in the treatment group compared to .64 fewer in the matched analysis
# use emmeans package to calculate 'marginal means' or adjusted means
library(emmeans)
emmeans (m1, ~ Control_Study_Flag) # 4.81-3.77 = 1.04 which is ~ what we get directly from the regression above
### example negative binomial model
# NOTE(review): glm.nb() is from the MASS package -- confirm library(MASS) is loaded
summary(m1 <- glm.nb(Post_ER_Admit ~ Control_Study_Flag + CaseOpenReasonDesc + Diag_Grouping + SourceFlag + Gender_Code + LowIncome + RuralZips +Pre_ER_Admit + AcutePre + Length_Stay_Pre + PrimaryCareVisitCount + Cancer_abnormal_cervical + Asthma + Pulmonary_Disease + HTN + Heart_Disease + Diabetes + Obesity + Depression +HIV + Other_Mental_Disorders + RiskLvl + OtherProgramFlag +Total_Bene + Total_TEPRV_ID + Total_Hospital_Beds + Total_Hospitals + BranchDesc + SponsorRankDesc + AgeCat, control = glm.control(maxit = 500),data = df_chrt))
exp(0.184) # IRR = 1.202 treatment group has ~ 20% more ER visits than controls
m1.mmeans <- emmeans(m1, ~ Control_Study_Flag) # estimate marginal means
summary(m1.mmeans, infer = TRUE, type = 'response') # summary with backtransformation to reflect original scale
(.0634/.0527) # this gives a ratio of marginal means = 1.203 which is consistent with the exponentiated result directly from the model
# *-----------------------------------------------------------------*
# |
# |
# |
# | labels and formats
# |
# |
# |
# *------------------------------------------------------------------*
#-----------------------------------------------------------------#
# value labels or formats
#-----------------------------------------------------------------#
##
## example 1: recoding
##
# change location from 'numeric' into a 'factor' with character values
# this is not actually creating labels, but actually changing the numeric values
# of LOCATION to more meaningful character values such as "N" for north and "S" for south
# NOTE(review): four levels (N/S/E/W) are declared even though the sample data
# shown later only contains LOCATION values 1 and 2 -- E and W remain unused levels
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
plots2
##
## example 2: creating new variables to act as labels for values
##
# lets create new variables to act as labels for insect pressure levels 1-5
# instead of changing the numeric values for P1-P4, we are creating a new
# set of character variables to describe P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
# now create a new set of factor variables that describe P1-P4
plots2$P1f <- factor(plots2$P1, myPlevels, myPlabels)
plots2$P2f <- factor(plots2$P2, myPlevels, myPlabels)
plots2$P3f <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4f <- factor(plots2$P4, myPlevels, myPlabels)
plots2
# Get summary and see that the factor levels are now counted.
summary( plots2[ c("P1f","P2f","P3f","P4f") ] )
##
## note we could have just renamed these values just as we did for
## location in the first example
##
plots2b <- read.table("plots2.txt", header =TRUE) # copy of plots2 for this example
plots2b
# note we are re-using pre-defined labels from above (myPlevels/myPlabels)
# so those statements are not repeated here
plots2b$P1 <- factor(plots2b$P1, myPlevels, myPlabels)
plots2b$P2 <- factor(plots2b$P2, myPlevels, myPlabels)
plots2b$P3 <- factor(plots2b$P3, myPlevels, myPlabels)
plots2b$P4 <- factor(plots2b$P4, myPlevels, myPlabels)
plots2b
#-----------------------------------------------------------#
# variable labels
#-----------------------------------------------------------#
# note in the examples above we were concerned with formatting
# the values a particular variable could take, such as values N, S
# for the variable LOCATION or the value "Very Heavy" for the variables P1-P4
# here we are looking at changing the name or appearance of the variables themselves
# vs the values they may take
library(Hmisc)
# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# assign variable names to act as labels - lets assume we
# want variables p1-p4 to appear as plot1 - plot4
# this example 'pastes' new names over the old ones or
# simply changes the variable names
# note the reference [3:6] simply identifies the 3rd-6th variable names
# in the data frame for processing.
# NOTE(review): names containing spaces (e.g. "PLOT 1") must be wrapped in
# backticks when referenced later, e.g. plots2$`PLOT 1`
names(plots2)[3:6] <- c(
"PLOT 1",
"PLOT 2",
"PLOT 3",
"PLOT 4")
names(plots2) # verify variable names have been changed
#-------------------------------------------------------#
# complete example for variable name and value labels
#-------------------------------------------------------#
# utilize all of the methods above to re-format a data set
# suppose again we start off with a data set such as plots2
# LOCATION TRAITS P1 P2 P3 P4
# 1 RR 1 1 5 1
# 2 RR 2 1 4 1
# 1 RR 2 2 4 3
# 2 RR 3 1 NA 3
# 1 BT 4 5 2 4
# 2 BT 5 4 5 5
# 1 BT 5 3 4 4
# 2 BT 4 5 5 5
# start fresh with the original unformatted data from plots2
plots2<- read.table("plots2.txt", header =TRUE)
plots2
# suppose we want the values for the variable LOCATION to be N for LOCATION = 1 and
# S for LOCATION = 2. We also want the values for the variables P1-P4 to be
# 1 = "Very Heavy" 2 ="Heavy" 3 = "Moderate" 4 = "Light" 5 ="Very Light"
# and we want the variable names P1-P4 to be "PLOT 1" - "PLOT 4"
# STEP 1: change values for variable LOCATION
plots2$LOCATION <- factor(plots2$LOCATION,levels=c(1,2,3,4),labels=c("N", "S", "E", "W"))
# STEP 2: change values for the variables P1-P4
myPlevels<-c(1,2,3,4,5) # set the # of levels
myPlabels <-c("Very Heavy","Heavy","Moderate","Light","Very Light") # labels
plots2$P1 <- factor(plots2$P1, myPlevels, myPlabels) # note we are actually changing the values for P1-P4
plots2$P2 <- factor(plots2$P2, myPlevels, myPlabels) # vs creating variables to describe the values
plots2$P3 <- factor(plots2$P3, myPlevels, myPlabels)
plots2$P4 <- factor(plots2$P4, myPlevels, myPlabels)
# STEP 3: change the VARIABLE labels for P1-P4
names(plots2)[3:6] <- c(
"PLOT 1",
"PLOT 2",
"PLOT 3",
"PLOT 4")
# print re-formatted data set
plots2
# functions
# compute the portfolio standard deviation for a two-asset portfolio
#
# NOTE(review): the original version looped nWeights times but never used the
# loop index i -- it recomputed and overwrote the same value on every pass, so
# the loop was dead code.  The single vectorized expression below returns
# exactly the same result (a scalar if Wa/Wb are scalars, or a vector of
# standard deviations if Wa/Wb are weight vectors).
#
# Args:
#   nWeights: retained for backward compatibility with existing callers; it is
#             no longer used because the calculation is fully vectorized.
# Depends on globals (defined elsewhere): Wa, Wb (weights), std.a, std.b
# (standard deviations), p.ab (correlation between the assets)
# Returns: portfolio standard deviation(s) -- note it is sd, not variance,
# despite the "var" in the function name
myvar.port <- function(nWeights) {
  sqrt(Wa^2 * std.a^2 + Wb^2 * std.b^2 + 2 * p.ab * Wa * Wb * std.a * std.b)
}
# example: produce summary statistics for each category in a field
cats <- as.vector(tmp$Var1) # extract measure ids as a vector
# loop over every category; the original hard-coded 1:19, which silently
# skips categories when there are more than 19 and prints NA/NULL results
# when there are fewer
for (i in seq_along(cats)) {
  print(cats[i])
  print(summary(complications$Score[complications$Measure_ID == cats[i]]))
}
#--------------------------------------------------
# dealing with scientific notation
#--------------------------------------------------
# a very small number that R would normally display as 1.82e-07
x <- 1.82 * (10^-7)
# render it as a fixed decimal string rather than scientific notation
y <- format(x, scientific = FALSE)
# or even better: globally discourage scientific notation by raising the
# penalty R applies before switching to it
options(scipen = 999)
#----------------------------------------------------
# PROPENSITY SCORE MATCHING
#---------------------------------------------------
######################################################
# example from MatchIt documentation
#####################################################
# NOTE(review): requires library(MatchIt); the lalonde data frame ships with
# that package (data(lalonde)) -- confirm it is loaded before running
# example data set is a subset of the job training program analyzed in Lalonde (1986) and Dehejia and Wahba (1999).
# MatchIt includes a subsample of the original data con- sisting of the National Supported Work Demonstration (NSW)
# treated group and the comparison sample from the Population Survey of Income Dynamics (PSID).1 The variables in this data
# set include participation in the job training program (treat, which is equal to 1 if participated in the program, and 0 otherwise),
# age (age), years of education (educ), race (black which is equal to 1 if black, and 0 otherwise;
# hispan which is equal to 1 if hispanic, and 0 otherwise), marital status (married, which is equal to 1 if married, 0 otherwise),
# high school degree (nodegree, which is equal to 1 if no degree, 0 otherwise), 1974 real earnings (re74), 1975 real earnings
# (re75), and the main outcome variable, 1978 real earnings (re78)
head(lalonde) # first six rows
dim(lalonde) # rows x columns
names(lalonde)
# "treat" "age" "educ" "black" "hispan" "married" "nodegree" "re74" "re75" "re78"
####################################################
# nearest neighbor matching
####################################################
# Matching is done using a distance measure specified by the distance option (default=logit).
# Matches are chosen for each treated unit one at a time, with the order specified by the m.order command (default=largest to smallest).
# At each matching step we choose the control unit that is not yet matched but is closest to the treated unit on the distance measure.
m.out1 <- matchit(treat ~ re74 + re75 + age + educ, data = lalonde, method = "nearest", distance = "logit")
summary(m.out1) # check balance
m.data1 <- match.data(m.out1,distance ="pscore") # create ps matched data set
head(m.data1) # view
# perform paired t-tests (not in documentation)
# NOTE(review): treating the 1:1 matched data as paired observations assumes
# treated and control rows line up pair-by-pair -- verify before relying on this
t.test(m.data1$re78[m.data1$treat==1],m.data1$re78[m.data1$treat==0],paired=TRUE)
t.test(lalonde$re78[lalonde$treat==1],lalonde$re78[lalonde$treat==0],paired = FALSE) # unmatched comparison for contrast
# export data to compare in SAS or perform additional analysis
# NOTE(review): the "//Documents//Briefcase///Data//" paths look machine-specific -- adjust to your environment
write.csv(lalonde, file = "//Documents//Briefcase///Data//lalonde.csv")
# ex
write.csv(w1, file = "r goals.csv") # assumes assigned directory or default directory
write.csv(m.data1, file = "//Documents//Briefcase///Data//lalonde_nearest.csv")
# ex without rownames
write.csv(opioid_dat, file = "//Documents//Briefcase///Data//opioid_dat.csv", row.names = FALSE)
#---------------------------------
# working with dates
#--------------------------------
# calculate difference in dates
# ex 1: subtracting two Date objects yields a difftime in days
survey$date_diff <- as.Date(as.character(survey$date), format="%Y/%m/%d")-as.Date(as.character(survey$tx_start), format="%Y/%m/%d")
survey
# ex 2: difftime() makes the units explicit
df$diff_in_days<- difftime(df$datevar1 ,df$datevar2 , units = c("days"))
# application: absolute day gap between measure start and end dates
temp3_dates$diff_days <- abs((as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y")-as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y")))
# or equivalently with difftime()
temp3_dates$diff_days <- abs((difftime(as.Date(as.character(temp3_dates$Measure_Start_Date), format="%m/%d/%Y"),as.Date(as.character(temp3_dates$Measure_End_Date), format="%m/%d/%Y") , units = c("days"))))
# create date format
df1$CovEffDate <- as.Date(df1$Coverage_Effective_Date, "%Y-%m-%d")
# sometimes you need to supply the origin if your data is numeric
# NOTE(review): as_datetime() is a lubridate function -- library(lubridate) required
as_datetime(y, origin = lubridate::origin)
# I have also seen (numeric day counts are offsets from the 1970-01-01 epoch)
as.Date(17536,origin="1970-01-01")
# get min and max dates
# NOTE(review): %>% and summarize() require library(dplyr)
tmp <- df1%>%
summarize(mindate = min(CovEffDate, na.rm=TRUE),
maxdate = max(CovEffDate, na.rm=TRUE))
## first days of years
seq(as.Date("1910/1/1"), as.Date("1999/1/1"), "years")
## by month
seq(as.Date("2000/1/1"), by = "month", length.out = 12)
## quarters
seq(as.Date("2000/1/1"), as.Date("2003/1/1"), by = "quarter")
## find all 7th of the month between two dates, the last being a 7th.
st <- as.Date("1998-12-17")
en <- as.Date("2000-1-7")
ll <- seq(en, st, by = "-1 month") # step backwards a month at a time from the end date
rev(ll[ll > st & ll < en])
# application: calculate age at index date for an observational study
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- as.Date(df1$BirthDate, "%Y-%m-%d") # format as date
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age in years (365.25 accounts for leap years)
df1$age <- as.numeric(df1$age) # make sure this is numeric
# calculate age using lubridate function
df1$BirthDate <- as.character(df1$DOB) # format as character
df1$BirthDate <- mdy(df1$BirthDate) # apply date format (lubridate function)
df1$age <- round(difftime(df1$index_date,df1$BirthDate, units = c("days"))/(365.25)) # calculate age
df1$age <- as.numeric(df1$age) # make numeric
#----------------------------
# sample training and validation
#---------------------------
# store total number of observations in your data
N <- 400
print(N)
# Number of training observations (50/50 split here; a non-integer split
# fraction would need rounding before use as a row index)
Ntrain <- N * 0.5
print(Ntrain)
# add an explicit row number variable for tracking
id <- seq(1,400)
apps2 <- cbind(apps,id) # NOTE(review): assumes `apps` has exactly 400 rows -- confirm
# Randomly arrange the data and divide it into a training
# and test set.
dat <- apps2[sample(1:N),] # shuffle the rows
train <- dat[1:Ntrain,] # first half -> training set
validate <- dat[(Ntrain+1):N,] # second half -> validation set
dim(dat)
dim(train)
dim(validate)
# sort and look at data sets to see that they are different
sort_train <- train[order(train$id),]
print(sort_train)
sort_val <- validate[order(validate$id),]
print(sort_val)
# random sample
# draw n rows (without replacement) from data frame df; uses the current RNG
# state, so repeated calls return different rows
# (fixed: use <- rather than = for assignment; drop the superfluous return())
randomSample <- function(df, n) {
  df[sample(nrow(df), n), ]
}
rs <- randomSample(tmp,5)
# random sample (with set.seed for repeatability)
# same as above, but fixes the RNG seed so every call returns the same rows
# NOTE(review): set.seed() inside a function resets the global RNG stream as a
# side effect -- deliberate here for repeatability, but be aware downstream
# (fixed: use <- rather than = for assignment; drop the superfluous return())
randomSample <- function(df, n) {
  set.seed(123)
  df[sample(nrow(df), n), ]
}
randomSample(tmp,5) # first call
randomSample(tmp,5) # second call - identical rows to the first because the seed is fixed inside the function
#-----------------------------------
# bootstrap
#-----------------------------------
# regression
summary(fit <- lm(GARST ~ PIO, data = yield_data))
# bootstrap the slope coefficient (this may not be accurate with such a small sample)
# preallocate the result vector rather than growing it with c() inside the
# loop -- growing copies the whole vector on every iteration (O(n^2))
n_reps <- 10000
bstrap <- numeric(n_reps)
for (i in seq_len(n_reps)) {
  # resample 5 rows with replacement and refit the regression
  newsample <- yield_data[sample(nrow(yield_data), 5, replace = TRUE), ]
  bstrap[i] <- as.vector(coef(lm(GARST ~ PIO, newsample))[2])
}
hist(bstrap) # bootstrap distribution of the slope
summary(bstrap)
sd(bstrap, na.rm = TRUE) # bootstrap SE of the slope
quantile(bstrap, c(.025,.975),na.rm = TRUE) # 95% percentile interval
#------------------------------------
# transpose or reshape data
#------------------------------------
# reference: https://stats.idre.ucla.edu/r/faq/how-can-i-reshape-my-data-in-r/
# create example data
id <- c(1,1,1,2,2,2,3,3,3)
measure <- c("depth","temp","width","depth","temp","width","depth","temp","width")
values <- c(2,50,18,1.5,53,18,2.5,60,18)
dat <- data.frame(id,measure,values)
# transpose panel data to wide format
datwide <- reshape(dat,
                   timevar = "measure", # the panel variable whose levels become new columns
                   idvar = c("id"), # vars we want to keep constant
                   direction = "wide")
# cleanup names: reshape() prefixes the new columns as "values.depth" etc.
# NOTE(review): this rename must happen BEFORE the long reshape below -- in the
# original the rename came after, so `varying = c("depth","temp","width")`
# referenced columns that did not exist yet and reshape() errored
names(datwide) <- gsub("values.", "", names(datwide)) # cleanup names
# collapse wide data back into panel (long) format
datnarrow <- reshape(datwide,
                     varying = c("depth","temp","width"), # things we want to collapse into a single column
                     v.names = "value", # name for new column that will hold panel of values
                     timevar = "measure", # name for new column that collapses old variable names into values
                     times = c("depth","temp","width"), # old variable names that become new values
                     new.row.names = 1:1000,
                     direction = "long")
#----------------------------
# check duplicates
#---------------------------
# check duplication
# table() counts rows per Provider_ID; Freq > 1 in the summary flags duplicate ids
tmp <- data.frame(table(temp1_stars$Provider_ID))
summary(tmp)
# remove duplicates or get distinct
DEK <- c(150,150,145,140,144,148,152,156,160,164)
PLOT <- c(1,1,1,1,5,6,7,8,9,10)
BT <- c('Y','Y','Y','N','N','N','Y','N','Y','Y')
dat <- data.frame(DEK,PLOT,BT)
# how many unique values of BT?
# NOTE: since R 4.0 data.frame() no longer converts strings to factors, so the
# original length(levels(dat$BT)) returns 0 (levels() of a character vector is
# NULL); count the distinct values directly instead
length(unique(dat$BT))
# how many unique plot values
length(unique(dat$PLOT))
# this data is duplicated in many ways
deduped.dat <- unique(dat[,1:3 ] ) # gets unique based on all 3 fields
# dplyr offers several options
# NOTE(review): the %>% pipelines below require library(dplyr)
deduped.dat <- dat%>%distinct(PLOT, .keep_all = TRUE) # based on PLOT
deduped.dat <- dat%>%distinct(PLOT,BT, .keep_all = TRUE) # based on PLOT and BT
deduped.dat <- dat%>%distinct(PLOT,BT,DEK, .keep_all = TRUE) # truly unique based on all listed fields
deduped.dat <- dat%>% distinct # same as above without listing fields related to duplication
# get distinct instance based on ID and pre term flag = 0
# (arrange sorts pre_term ascending so the 0 row comes first; distinct then keeps it)
tmp1 <- tmp1_births%>%arrange(MBR_ID,pre_term)%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID)) # verify: every member should now appear once
tmp4 <- tmp1[tmp1$MBR_ID =='10060',] # spot check a single member
# get distinct last instance based on ID and pre term flag = 1
# (desc() sorts pre_term descending so the 1 row comes first; distinct keeps it)
tmp1 <- tmp1_births%>%arrange(MBR_ID,desc(pre_term))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp2 <- data.frame(table(tmp1$MBR_ID))
tmp4 <- tmp1[tmp1$MBR_ID =='10060',]
# correcting plot boundaries (shrink margins so diagnostic plots fit the device)
par(mar=c(1,1,1,1))
# example routine checking and correcting duplicates based on key and 1 additional field
# NOTE(review): uses dplyr (%>%, arrange, distinct) and the randomSample helper
# defined earlier in this script
# check duplicates
tmp1 <- data.frame(table(df2$MBR_ID)) # find duplicates
tmp <- df2[df2$MBR_ID =='10060',] # appears duplicated on diagnosis &/or claim #
# get distinct sorting to keep most recent diagnosis (judgement call)
tmp2 <- df2%>%arrange(MBR_ID,desc(SVC_FROM_DT))%>%distinct(MBR_ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$MBR_ID =='10060',] # check
# update data frame
df2 <- tmp2
rm(tmp,tmp1,tmp2,tmp3) # cleanup
# example referencing against a list that is a random sample of numeric IDs
# note: change formatting in 'vals' if character IDs are required
# check duplicates
tmp1 <- data.frame(table(df5$ID)) # find duplicates
tmp <- df5[df5$ID =='80050',] # appears duplicated on flag #
# check against a random sample of duplicates
rs <- randomSample(tmp1[tmp1$Freq > 1,],5) # look at random sample of duplicates
vals <- as.list((as.numeric(as.character(rs$Var1)))) # get ID as list from rs
tmp <- df5[df5$ID %in% vals,] # subset data with duplicated rows - check
# get distinct sorting to give priority to pre-term flag = 1 (judgement call)
tmp2 <- df5%>%arrange(ID,desc(pre_term))%>%distinct(ID, .keep_all = TRUE)
tmp3 <- tmp2[tmp2$ID %in% vals,] # check again - all pre_term = 1 obs should be kept
# update data frame
df5 <- tmp2
rm(tmp,tmp1,tmp2,tmp3,rs,vals) # cleanup
#-----------------------------
# missing values
#-----------------------------
# count missing values per column in a data frame
colSums(is.na(df1_chrt))
### example recode missing category
# (is.na() already returns a logical; comparing it == TRUE was redundant)
df1$Lvl <- ifelse(is.na(df1$Level),'1-Low-Risk',df1$Lvl)
### example - categorizing with a missing value category
df1$riskcat <- ifelse(df1$Risk_Score <= 3,'1-Low',
                      ifelse(df1$Risk_Score > 3 & df1$Risk_Score <= 5, '2-Moderate', '3-High'))
# account for NA group
df1$riskcat <- ifelse(is.na(df1$Risk_Score),'0-NA',df1$riskcat)
# remove missing based on specified field
# (the original compared is.na() == 'FALSE', i.e. a logical against a string,
# relying on implicit coercion -- negate the logical directly instead)
tmp3 <- tmp3[!is.na(tmp3$RUCC_2013),]
tmp2 <- na.omit(tmp1) # listwise deletion of all missing
# basic imputation
# replace every NA in x with the mean of the non-missing values
impute.mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}
apply(df$xvar,2,impute.mean)
# replace every NA in x with zero (wholesale zero-fill imputation)
impute.zero <- function(x) {
  replace(x, is.na(x), 0)
}
df1 <- data.frame(apply(df,2,impute.zero)) # this imputes all missing column values to zero
#--------------------------
# loop processing
#-------------------------
# loop across fields and produce frequencies
results <- list() # list for storing results
# run loop and simulate data
# (fixed: <- instead of = for assignment; [[ ]] to set a single list element)
for (i in 1:5) {
  results[[i]] <- sample(1:15, 1)
}
x <- as.numeric(as.character(unlist(results))) # flatten list into a numeric vector
summary(x) # analysis
#--------------------------------------
# working with functions
#--------------------------------------
# NOTE(review): apply() on a data frame coerces it to a matrix first -- with any
# non-numeric column sd() will then fail; sapply(df, sd) avoids the coercion
apply(df,2,sd) # calculate standard deviation for every column in data set
# NOTE(review): sapply() here returns a matrix, not a data frame; wrap the
# result in data.frame() if a data frame is needed downstream
df2 <- sapply(df1,as.numeric) # convert all columns to numeric
#-----------------------------------
# substring text and character operations
#-----------------------------------
# if ENDT still NULL and there is a coverage effective date then replace null end date with today's date
# NOTE(review): as.character.Date("2018-10-14") calls the Date method on a plain
# string -- it returns the string unchanged, but as.character() would be clearer
df1$ENDT <- ifelse((substr(df1$ENDT,1,1) =="N" & (substr(df1$DateVar,1,1) != "N")),as.character.Date("2018-10-14"),df1$ENDT)
# remove first character
# (fixed: the original read from df$ID while assigning into df1$ID -- almost
# certainly a typo for df1; confirm df was not intended)
df1$ID <- substring(df1$ID, 2)
# remove last character
df1$ID <- substring(df1$ID,1,nchar(df1$ID)-1) # drop the trailing character
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment