ajdamico · February 12, 2012 00:13
diff --git a/using R to replicate the NHIS Multiple Imputation technique.R b/using R to replicate the NHIS Multiple Imputation technique.R
 #page 118 of the NHIS document
 #ftp://ftp.cdc.gov/pub/health_statistics/nchs/dataset_documentation/nhis/2010/srvydesc.pdf
 #displays the R code to load the persons file into R as a survey object
 #the code below creates a slightly different survey object, one that includes appropriately-imputed income.


 #this R code:
 #	reads the year 2000 personsx file into R
 #	reads in all five imputed income files
 #	merges the 2000 personsx file with the five imputed income files
 #	creates a MI survey object
 #	runs six example analyses with the multiply-imputed R survey object

 #set the number of digits displayed
 options(digits=16)

 #NOTE that the three packages below must be installed (just once)
 #if this is the first time using R with these packages, run this line:
 #install.packages(c("mitools","survey","sas7bdat"))

 #load the multiple imputation package
 library(mitools)

 #load the complex survey analysis package
 library(survey)

 #set the current working directory to the location where the 2000 personsx SAS data set is stored.
 setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")

 #load the .sas7bdat importation package
 library(sas7bdat)

 ###############################
 #import NHIS 2000 PERSONSX file

 #this can be done a number of different ways

 #read in the NHIS 2000 PERSONSX file from a SAS file  (this takes a while)
 x <- read.sas7bdat( "personsx.sas7bdat" )
 #now the NHIS 2000 PERSONSX file is a data frame (x) stored in R!

 #if using read.sas7bdat, these four columns will be factor variables
 #and must be converted to numeric ones in order for the merge to work

 for ( i in c("SRVY_YR","HHX","FMX","PX") ){
 	 x[ , i ] <- as.numeric( as.character( x[ , i ] ) )
 }

 #end import of PERSONSX file
 ###############################



 #loop through j = 1 through 5 and..
 #read in the five imputed income files. (this also takes a while)
 for ( j in 1:5 ){
 	#print the current iteration of the loop
 	print( j )

 	#set the current working directory to the location where the five 2000 imputed income SAS data sets are stored.
 	setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")

 	#read in the SAS imputed income file number "j"
 	y <- read.sas7bdat(paste( "incmimp" , j , ".sas7bdat", sep=""))

 	#dump RECTYPE variable, which is already on the PERSONSX file
 	y$RECTYPE <- NULL
 	
 	#merge the personsx file with the imputed income file
 	z <- merge( x , y , by.x=c("SRVY_YR","HHX","FMX","PX") , by.y=c("SRVY_YR","HHX","FMX","FPX") )
 	
 	#print the number of rows in the personsx data frame and
 	#also the number of rows in the merged personsx-imputed income data frame
 	#to make sure they are the same!
 	print( nrow( x ) )
 	print( nrow( y ) )
 	print( nrow( z ) )
 	
 	###########################
 	#START OF VARIABLE RECODING
 	#any new variables that the user would like to create should be constructed here
 	
 	#create the NOTCOV variable
 	#shown on page 47 (PDF page 51) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
 	z <- transform( z , NOTCOV = ifelse( NOTCOV %in% 7:9 , NA , NOTCOV ))
 	
 	#create the POVERTYI variable
 	#shown on page 48 (PDF page 52) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
 	z <- transform( z , POVERTYI =
 		ifelse( RAT_CATI %in% 1:3 , 1 ,
 		ifelse( RAT_CATI %in% 4:7 , 2 ,
 		ifelse( RAT_CATI %in% 8:11 , 3 ,
 		ifelse( RAT_CATI %in% 12:14 , 4 , NA ) ) ) ) )

 	#END OF VARIABLE RECODING
 	#########################
 	
 	#########################
 	#delete columns you don't need to free up RAM
 	mini_z <- z[ , c("PSU","STRATUM","WTFA","POVERTYI","NOTCOV") ]
 	#end of column deletions
 	#########################
 	
 	#save the current data frame (z) as z1, z2, z3, z4, or z5
 	#depending on the current iteration of the j = 1 through 5 loop
 	assign( paste( "z" , j , sep = "" ) , mini_z )
 	
 	#delete the y and z data frames to free up RAM
 	y <- z <- NULL
 	
 	#garbage collection: this frees up RAM
 	gc()
 }
 #when the loop has terminated, data frames z1 through z5 exist
 #each are the personsx file merged with one of the five imputed income files
 #and each include all recoded variables.


 #using all five merged personsx-MI files,
 #create the multiple imputation survey object
 nhissvy <- svydesign( id = ~PSU , strata=~STRATUM , weight=~WTFA , data=imputationList(list(z1,z2,z3,z4,z5)) , nest=T )

 #delete the y and z data frames to free up RAM
 x <- z1 <- z2 <- z3 <- z4 <- z5 <- NULL

 #garbage collection: this frees up RAM
 gc()


 ##################################################################
 #now that the R survey object (nhissvy) has been constructed,
 #analyses can be run.

 #the following output matches PDF page 60 on http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf

 #this displays the crosstab statistics..
 	#not broken out by the POVERTYI variable

 #print the unweighted N
 MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , unwtd.count( ~factor(NOTCOV) , na.rm=T ) ) )
 #print the weighted N
 MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svytotal( ~factor(NOTCOV) , na.rm=T ) ) )
 #print the overall percents
 MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svymean( ~factor(NOTCOV) , na.rm=T ) ) )

 	#broken out by the POVERTYI variable

 #print the unweighted N
 MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , unwtd.count , na.rm=T ) ) )
 #print the weighted N
 MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svytotal , na.rm=T ) ) )
 #print the row percents
 MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svymean , na.rm=T ) ) )
	#page 118 of the NHIS document
	#ftp://ftp.cdc.gov/pub/health_statistics/nchs/dataset_documentation/nhis/2010/srvydesc.pdf
	#displays the R code to load the persons file into R as a survey object
	#the code below creates a slightly different survey object, one that includes appropriately-imputed income.


	#this R code:
	# reads the year 2000 personsx file into R
	# reads in all five imputed income files
	# merges the 2000 personsx file with the five imputed income files
	# creates a MI survey object
	# runs six example analyses with the multiply-imputed R survey object

	#set the number of digits displayed
	options(digits=16)

	#NOTE that the three packages below must be installed (just once)
	#if this is the first time using R with these packages, run this line:
	#install.packages(c("mitools","survey","sas7bdat"))

	#load the multiple imputation package
	library(mitools)

	#load the complex survey analysis package
	library(survey)

	#set the current working directory to the location where the 2000 personsx SAS data set is stored.
	setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")

	#load the .sas7bdat importation package
	library(sas7bdat)

	###############################
	#import NHIS 2000 PERSONSX file

	#this can be done a number of different ways

	#read in the NHIS 2000 PERSONSX file from a SAS file (this takes a while)
	x <- read.sas7bdat( "personsx.sas7bdat" )
	#now the NHIS 2000 PERSONSX file is a data frame (x) stored in R!

	#if using read.sas7bdat, these four columns will be factor variables
	#and must be converted to numeric ones in order for the merge to work

	for ( i in c("SRVY_YR","HHX","FMX","PX") ){
	x[ , i ] <- as.numeric( as.character( x[ , i ] ) )
	}

	#end import of PERSONSX file
	###############################



	#loop through j = 1 through 5 and..
	#read in the five imputed income files. (this also takes a while)
	for ( j in 1:5 ){
	#print the current iteration of the loop
	print( j )

	#set the current working directory to the location where the five 2000 imputed income SAS data sets are stored.
	setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")

	#read in the SAS imputed income file number "j"
	y <- read.sas7bdat(paste( "incmimp" , j , ".sas7bdat", sep=""))

	#dump RECTYPE variable, which is already on the PERSONSX file
	y$RECTYPE <- NULL

	#merge the personsx file with the imputed income file
	z <- merge( x , y , by.x=c("SRVY_YR","HHX","FMX","PX") , by.y=c("SRVY_YR","HHX","FMX","FPX") )

	#print the number of rows in the personsx data frame and
	#also the number of rows in the merged personsx-imputed income data frame
	#to make sure they are the same!
	print( nrow( x ) )
	print( nrow( y ) )
	print( nrow( z ) )

	###########################
	#START OF VARIABLE RECODING
	#any new variables that the user would like to create should be constructed here

	#create the NOTCOV variable
	#shown on page 47 (PDF page 51) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
	z <- transform( z , NOTCOV = ifelse( NOTCOV %in% 7:9 , NA , NOTCOV ))

	#create the POVERTYI variable
	#shown on page 48 (PDF page 52) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
	z <- transform( z , POVERTYI =
	ifelse( RAT_CATI %in% 1:3 , 1 ,
	ifelse( RAT_CATI %in% 4:7 , 2 ,
	ifelse( RAT_CATI %in% 8:11 , 3 ,
	ifelse( RAT_CATI %in% 12:14 , 4 , NA ) ) ) ) )

	#END OF VARIABLE RECODING
	#########################

	#########################
	#delete columns you don't need to free up RAM
	mini_z <- z[ , c("PSU","STRATUM","WTFA","POVERTYI","NOTCOV") ]
	#end of column deletions
	#########################

	#save the current data frame (z) as z1, z2, z3, z4, or z5
	#depending on the current iteration of the j = 1 through 5 loop
	assign( paste( "z" , j , sep = "" ) , mini_z )

	#delete the y and z data frames to free up RAM
	y <- z <- NULL

	#garbage collection: this frees up RAM
	gc()
	}
	#when the loop has terminated, data frames z1 through z5 exist
	#each are the personsx file merged with one of the five imputed income files
	#and each include all recoded variables.


	#using all five merged personsx-MI files,
	#create the multiple imputation survey object
	nhissvy <- svydesign( id = ~PSU , strata=~STRATUM , weight=~WTFA , data=imputationList(list(z1,z2,z3,z4,z5)) , nest=T )

	#delete the y and z data frames to free up RAM
	x <- z1 <- z2 <- z3 <- z4 <- z5 <- NULL

	#garbage collection: this frees up RAM
	gc()


	##################################################################
	#now that the R survey object (nhissvy) has been constructed,
	#analyses can be run.

	#the following output matches PDF page 60 on http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf

	#this displays the crosstab statistics..
	#not broken out by the POVERTYI variable

	#print the unweighted N
	MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , unwtd.count( ~factor(NOTCOV) , na.rm=T ) ) )
	#print the weighted N
	MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svytotal( ~factor(NOTCOV) , na.rm=T ) ) )
	#print the overall percents
	MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svymean( ~factor(NOTCOV) , na.rm=T ) ) )

	#broken out by the POVERTYI variable

	#print the unweighted N
	MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , unwtd.count , na.rm=T ) ) )
	#print the weighted N
	MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svytotal , na.rm=T ) ) )
	#print the row percents
	MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svymean , na.rm=T ) ) )