Created
February 12, 2012 00:13
-
-
Save ajdamico/1805260 to your computer and use it in GitHub Desktop.
replicate the national health interview survey's multiply imputed income technique using R instead of SUDAAN
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#page 118 of the NHIS document | |
#ftp://ftp.cdc.gov/pub/health_statistics/nchs/dataset_documentation/nhis/2010/srvydesc.pdf | |
#displays the R code to load the persons file into R as a survey object | |
#the code below creates a slightly different survey object, one that includes appropriately-imputed income. | |
#this R code: | |
# reads the year 2000 personsx file into R | |
# reads in all five imputed income files | |
# merges the 2000 personsx file with the five imputed income files | |
# creates a MI survey object | |
# runs six example analyses with the multiply-imputed R survey object | |
#set the number of digits displayed | |
options(digits=16) | |
#NOTE that the three packages below must be installed (just once) | |
#if this is the first time using R with these packages, run this line: | |
#install.packages(c("mitools","survey","sas7bdat")) | |
#load the multiple imputation package | |
library(mitools) | |
#load the complex survey analysis package | |
library(survey) | |
#set the current working directory to the location where the 2000 personsx SAS data set is stored. | |
setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000") | |
#load the .sas7bdat importation package | |
library(sas7bdat) | |
############################### | |
#import NHIS 2000 PERSONSX file | |
#this can be done a number of different ways | |
#read in the NHIS 2000 PERSONSX file from a SAS file (this takes a while) | |
x <- read.sas7bdat( "personsx.sas7bdat" ) | |
#now the NHIS 2000 PERSONSX file is a data frame (x) stored in R! | |
#if using read.sas7bdat, these four columns will be factor variables | |
#and must be converted to numeric ones in order for the merge to work | |
for ( i in c("SRVY_YR","HHX","FMX","PX") ){ | |
x[ , i ] <- as.numeric( as.character( x[ , i ] ) ) | |
} | |
#end import of PERSONSX file | |
############################### | |
#loop through j = 1 through 5 and.. | |
#read in the five imputed income files. (this also takes a while) | |
for ( j in 1:5 ){ | |
#print the current iteration of the loop | |
print( j ) | |
#set the current working directory to the location where the five 2000 imputed income SAS data sets are stored. | |
setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000") | |
#read in the SAS imputed income file number "j" | |
y <- read.sas7bdat(paste( "incmimp" , j , ".sas7bdat", sep="")) | |
#dump RECTYPE variable, which is already on the PERSONSX file | |
y$RECTYPE <- NULL | |
#merge the personsx file with the imputed income file | |
z <- merge( x , y , by.x=c("SRVY_YR","HHX","FMX","PX") , by.y=c("SRVY_YR","HHX","FMX","FPX") ) | |
#print the number of rows in the personsx data frame and | |
#also the number of rows in the merged personsx-imputed income data frame | |
#to make sure they are the same! | |
print( nrow( x ) ) | |
print( nrow( y ) ) | |
print( nrow( z ) ) | |
########################### | |
#START OF VARIABLE RECODING | |
#any new variables that the user would like to create should be constructed here | |
#create the NOTCOV variable | |
#shown on page 47 (PDF page 51) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf | |
z <- transform( z , NOTCOV = ifelse( NOTCOV %in% 7:9 , NA , NOTCOV )) | |
#create the POVERTYI variable | |
#shown on page 48 (PDF page 52) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf | |
z <- transform( z , POVERTYI = | |
ifelse( RAT_CATI %in% 1:3 , 1 , | |
ifelse( RAT_CATI %in% 4:7 , 2 , | |
ifelse( RAT_CATI %in% 8:11 , 3 , | |
ifelse( RAT_CATI %in% 12:14 , 4 , NA ) ) ) ) ) | |
#END OF VARIABLE RECODING | |
######################### | |
######################### | |
#delete columns you don't need to free up RAM | |
mini_z <- z[ , c("PSU","STRATUM","WTFA","POVERTYI","NOTCOV") ] | |
#end of column deletions | |
######################### | |
#save the current data frame (z) as z1, z2, z3, z4, or z5 | |
#depending on the current iteration of the j = 1 through 5 loop | |
assign( paste( "z" , j , sep = "" ) , mini_z ) | |
#delete the y and z data frames to free up RAM | |
y <- z <- NULL | |
#garbage collection: this frees up RAM | |
gc() | |
} | |
#when the loop has terminated, data frames z1 through z5 exist | |
#each are the personsx file merged with one of the five imputed income files | |
#and each include all recoded variables. | |
#using all five merged personsx-MI files, | |
#create the multiple imputation survey object | |
nhissvy <- svydesign( id = ~PSU , strata=~STRATUM , weight=~WTFA , data=imputationList(list(z1,z2,z3,z4,z5)) , nest=T ) | |
#delete the y and z data frames to free up RAM | |
x <- z1 <- z2 <- z3 <- z4 <- z5 <- NULL | |
#garbage collection: this frees up RAM | |
gc() | |
################################################################## | |
#now that the R survey object (nhissvy) has been constructed, | |
#analyses can be run. | |
#the following output matches PDF page 60 on http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf | |
#this displays the crosstab statistics.. | |
#not broken out by the POVERTYI variable | |
#print the unweighted N | |
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , unwtd.count( ~factor(NOTCOV) , na.rm=T ) ) ) | |
#print the weighted N | |
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svytotal( ~factor(NOTCOV) , na.rm=T ) ) ) | |
#print the overall percents | |
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svymean( ~factor(NOTCOV) , na.rm=T ) ) ) | |
#broken out by the POVERTYI variable | |
#print the unweighted N | |
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , unwtd.count , na.rm=T ) ) ) | |
#print the weighted N | |
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svytotal , na.rm=T ) ) ) | |
#print the row percents | |
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svymean , na.rm=T ) ) ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment