Last active
February 24, 2016 22:14
-
-
Save ajdamico/4cd5f76aebbdaae5bc88 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(downloader) | |
# install.packages( c("MonetDB.R", "MonetDBLite" , "survey" , "SAScii" , "descr" , "downloader" , "digest" , "stringr" , "R.utils" , "RCurl" ) , repos=c("http://dev.monetdb.org/Assets/R/", "http://cran.rstudio.com/")) | |
library(SAScii) # load the SAScii package (imports ascii data with a SAS script) | |
library(RCurl) # load RCurl package (downloads https files) | |
library(stringr) # load stringr package (manipulates character strings easily) | |
library(downloader) # downloads and then runs the source() function on scripts from github | |
library(MonetDB.R) # load the MonetDB.R package (connects r to a monet database) | |
library(MonetDBLite) # load MonetDBLite package (creates database files in R) | |
library(descr) # load the descr package (converts fixed-width files to delimited files) | |
library(R.utils) # load the R.utils package (counts the number of lines in a file quickly) | |
library(foreign) # load foreign package (converts data files into R) | |
tf <- tempfile() | |
source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/MonetDB/read.SAScii.monetdb.R" , prompt = FALSE ) | |
dbfolder <- paste0( getwd() , "/MonetDB" ) | |
db <- dbConnect( MonetDBLite() , dbfolder ) | |
datafolder <- paste0( getwd() , "/data" ) | |
dir.create( datafolder ) | |
# initiate a curl handle so the remote server knows it's you. | |
curl = getCurlHandle() | |
# set a cookie file on the local disk | |
curlSetOpt( | |
cookiejar = 'cookies.txt' , | |
followlocation = TRUE , | |
autoreferer = TRUE , | |
curl = curl | |
) | |
dp <- "http://www.icpsr.umich.edu/cgi-bin/bob/terms2?study=36044&ds=1&bundle=ascsas&path=NACJD" | |
# post your username and password to the umich server | |
login.page <- | |
postForm( | |
"http://www.icpsr.umich.edu/ticketlogin" , | |
email = your.username , | |
password = your.password , | |
path = "NACJD" , | |
request_uri = dp , | |
style = "POST" , | |
curl = curl | |
) | |
# consent to terms of use page | |
terms.of.use.page <- | |
postForm( | |
"http://www.icpsr.umich.edu/cgi-bin/terms" , | |
agree = 'yes' , | |
path = "NACJD" , | |
study = "36044" , | |
ds = "1" , | |
bundle = "ascsas" , | |
dups = "yes" , | |
style = "POST" , | |
curl = curl | |
) | |
# download the current sas file onto the local disk | |
this.sas_ri <- getBinaryURL( dp , curl = curl ) | |
# save the actual downloaded-file to the filepath specified on the local disk | |
writeBin( this.sas_ri , paste0( datafolder , "/myfile.zip" ) ) | |
# unzip the downloaded file within the local drive | |
z <- unzip( paste0( datafolder , "/myfile.zip" ) , exdir = datafolder ) | |
# determine the filenames that end with `sas` | |
sas.import <- z[ grep( "sas$" , tolower( z ) ) ] | |
# determine the filenames containing the word `data` | |
data.file <- z[ grep( "data" , tolower( basename( z ) ) ) ] | |
tablename <- 'x36044_0001' | |
# read the data file into an r sqlite database | |
read.SAScii.monetdb( | |
fn = data.file , | |
sas_ri = sas.import , | |
tl = TRUE , # convert all column names to lowercase? | |
tablename = tablename , | |
skip.decimal.division = TRUE , | |
conn = db | |
) | |
# figure out which variables need to be recoded to system missing # | |
# read the entire sas import script into a character vector | |
recode.lines <- toupper( readLines( sas.import ) ) | |
# look for the start of the system missing recode block | |
mvr <- intersect( grep( "RECODE TO SAS SYSMIS" , recode.lines ) , grep( "USER-DEFINED MISSING VALUE" , recode.lines ) ) | |
# if there's just one.. | |
if ( length( mvr ) == 1 ){ | |
# isolate the recode lines | |
recode.lines <- recode.lines[ mvr:length( recode.lines ) ] | |
# find all lines that start with an IF statement and end with a semicolon | |
lines.with.if <- grep( "IF (.*);" , recode.lines ) | |
# confirm all of those lines have a sas missing value (a dot) somewhere in there. | |
lines.with.dots <- grep( "\\." , recode.lines ) | |
# if the lines don't match up, fail cuz something's wrong. terribly terribly wrong. | |
if ( length( lines.with.if[ !( lines.with.if %in% lines.with.dots ) ] ) > 0 ) stop( "some recode line is recoding to something other than missing" ) | |
# further limit the recode lines to only lines containing an if block | |
recodes <- recode.lines[ lines.with.if ] | |
# break the recode lines up by semicolons, in case there's more than one per line | |
recodes <- unlist( strsplit( recodes , ";" ) ) | |
# remove the word `IF ` | |
recodes <- gsub( "IF " , "" , recodes ) | |
# remove leading and trailing whitespace | |
recodes <- str_trim( recodes ) | |
# remove empty strings | |
recodes <- recodes[ recodes != '' ] | |
# find which variables need to be replaced by extracting whatever's directly in front of the equals sign | |
vtr <- str_trim( tolower( gsub( "(.*) THEN( ?)(.*)( ?)=(.*)" , "\\3" , recodes ) ) ) | |
# remove everything after the `THEN` block.. | |
ptm <- gsub( " THEN( ?)(.*)" , "" , recodes ) | |
# ..to create a vector of patterns to match | |
ptm <- tolower( str_trim( ptm ) ) | |
} | |
print( ptm ) | |
if ( dbGetQuery( db , paste0( 'select count(*) from ' , tablename ) )[ 1 , 1 ] < 100000 ){ | |
print('before dbreadtable') | |
# pull the data file into working memory | |
x <- dbReadTable( db , tablename ) | |
print('after dbreadtable') | |
# if there are any missing values to recode | |
if ( length( mvr ) == 1 ){ | |
print('after length(mvr)==1') | |
# loop through each variable to recode | |
for ( k in seq_along( vtr ) ){ | |
print('after k in seq_along(vtr)') | |
# overwrite sas syntax with r syntax in the patterns to match commands. | |
r.command <- gsub( "=" , "==" , ptm[ k ] ) | |
r.command <- gsub( " or " , "|" , r.command ) | |
r.command <- gsub( " and " , "&" , r.command ) | |
r.command <- gsub( " in \\(" , " %in% c\\(" , r.command ) | |
cat( r.command , '\r\n' ) | |
# wherever the pattern has been matched, overwrite the current variable with a missing | |
x[ with( x , which( eval( parse( text = r.command ) ) ) ) , vtr[ k ] ] <- NA | |
# if a column is *only* NAs then delete it | |
if( all( is.na( x[ , vtr[ k ] ] ) ) ) x[ , vtr[ k ] ] <- NULL | |
# clear up RAM | |
gc() | |
} | |
print("before remove table") | |
# remove the current data table from the database | |
dbRemoveTable( db , tablename ) | |
print("before write table" ) | |
# ..and overwrite it with the data.frame object | |
# that you've just blessedly cleaned up | |
dbWriteTable( db , tablename , x ) | |
} | |
print('before save' ) | |
# save the r data.frame object to the local disk as an `.rda` | |
save( x , file = gsub( "\\-Data\\.txt$" , "rda" , data.file ) ) | |
# remove the object from working memory | |
rm( x ) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment