digdeep · May 26, 2014 13:54
diff --git a/multiFile.R b/multiFile.R
 ## Read several comma-separated files from one directory, either as a list
 ## of data frames (one per file) or stacked into a single data frame.
 ## NOTE: setwd() changes the working directory for the whole session;
 ## replace the placeholder path before running.
 setwd("/YOUR/WORKING/DIRECTORY")

 ## Option 1: every entry returned by list.files() (no extension filter).
 file_list <- list.files()
 ## One data frame per file, in the same order as file_list.
 dataset <- lapply(file_list, FUN = function(files) {
   read.table(files, header = TRUE, sep = ",")
 })
 ## If the column structure is the same across all files, stack them.
 ## The list read above is reused so no file is parsed a second time.
 dataset <- do.call("rbind", dataset)

 ## Option 2: only files whose name ends in ".csv".
 ## `pattern` is a regular expression, not a shell glob, so the extension
 ## is anchored explicitly ("*.csv" would match any name containing "csv").
 temp <- list.files(pattern = "\\.csv$")
 ## One data frame per file, in the same order as temp.
 dataset <- lapply(temp, FUN = function(files) {
   read.table(files, header = TRUE, sep = ",")
 })
 ## Inspect a specific file of interest; [[ ]] yields the data frame itself
 ## rather than a one-element list.
 dataset[[1]]
 ## OR, if the column structure is the same across all CSVs, bind them all
 ## into one data frame (again reusing the tables already read).
 dataset <- do.call("rbind", dataset)
diff --git a/NASummary.R b/NASummary.R
 # Summarise missing and distinct values for every column of a data frame.
 #
 # Args:
 #   df:          A data frame.
 #   include.nan: If TRUE, also report NaN counts (nNan) and ratios (rNan).
 #
 # Returns:
 #   A data frame with one row per column of `df` (row names are the column
 #   names of `df`) and these columns:
 #     col               position of the column in `df`
 #     Count             number of rows in `df`
 #     nNA, rNA          count / ratio of values flagged by is.na()
 #     nNan, rNan        count / ratio of values flagged by is.nan()
 #                       (only when include.nan = TRUE)
 #     nUnique, rUnique  count / ratio of distinct values per unique()
 #   Ratios are truncated (not rounded) to four decimals. When `df` has zero
 #   rows the ratios are NaN (0 / 0). A zero-column `df` yields a zero-row
 #   result instead of an error.
 NAsummary <- function(df, include.nan = FALSE) {
   stopifnot(is.data.frame(df))
   n_cols <- ncol(df)
   n_rows <- nrow(df)

   # Share of rows, truncated to four decimal places.
   ratio <- function(count) trunc(count / n_rows * 10000) / 10000
   # Per-column count of elements for which `pred` is TRUE. vapply keeps the
   # result an integer vector even when `df` has no columns.
   count_where <- function(pred) {
     vapply(df, function(x) sum(pred(x)), integer(1))
   }

   # seq_len()/rep() keep all three components at length n_cols, so the
   # zero-column case builds an empty frame rather than failing.
   newdf <- data.frame(
     col = seq_len(n_cols),
     Count = rep(n_rows, n_cols),
     nNA = count_where(is.na)
   )
   newdf$rNA <- ratio(newdf$nNA)
   if (include.nan) {
     newdf$nNan <- count_where(is.nan)
     newdf$rNan <- ratio(newdf$nNan)
   }
   newdf$nUnique <- vapply(df, function(x) length(unique(x)), integer(1))
   newdf$rUnique <- ratio(newdf$nUnique)
   rownames(newdf) <- colnames(df)
   newdf
 }
diff --git a/transform.R b/transform.R
 # Fourth-spread of a numeric vector: upper fourth minus lower fourth.
 #
 # The depth of the fourths is (floor(median depth) + 1) / 2. Each fourth is
 # the mean of the two order statistics on either side of that depth, counted
 # from the bottom (lower fourth) and from the top (upper fourth).
 #
 # Args:
 #   x: A numeric vector. Missing values are ignored.
 #
 # Returns:
 #   A 1 x 1 numeric matrix (a consequence of %*%) holding the spread; a
 #   1 x 1 NA matrix when `x` has no non-missing values.
 spread <- function(x) {
   # sort() drops NA/NaN, so the length must be taken from the sorted vector;
   # using length(x) would point the upper indices past the end of the data.
   sorted <- sort(x)
   n <- length(sorted)
   if (n == 0) {
     return(matrix(NA_real_, nrow = 1, ncol = 1))
   }
   n.med <- (n + 1) / 2
   n.fourth <- (floor(n.med) + 1) / 2
   y <- sorted[c(floor(n.fourth), ceiling(n.fourth),
                 floor(n + 1 - n.fourth), ceiling(n + 1 - n.fourth))]
   # (upper pair sum - lower pair sum) / 2 == upper fourth - lower fourth.
   y %*% c(-1, -1, 1, 1) / 2
 }
 # Spread-vs-level analysis of `x`, followed by a power re-expression.
 # Requires a vector `x` and the function `spread()` to be defined already.
 # Fail early with a clear message instead of a cryptic plotting error.
 stopifnot(length(x) > 0)
 # Group consecutive runs of 12 observations: 0 for the first 12, 1 for the
 # next 12, and so on. Groups are labelled from 2010 upward in the plots
 # (presumably monthly data starting in 2010 -- confirm against the source).
 years <- floor((seq_along(x) - 1) / 12)
 z <- split(x, years)
 year_labels <- (min(years):max(years)) + 2010
 boxplot(z, names = year_labels, ylab = "y")
 # Spread vs. Level Plot
 z.med <- unlist(lapply(z, median))
 z.spread <- unlist(lapply(z, spread))
 # Regress log spread on log level; the slope drives the choice of power.
 fit <- lm(log(z.spread) ~ log(z.med))
 plot(log(z.med), log(z.spread), xlab = "Log Level", ylab = "Log Spread",
      main = "Spread vs. Level Plot")
 abline(fit, lwd = 2, col = "Red")
 # LAMBDA: power used for re-expression, one minus the fitted slope.
 lambda <- 1 - coef(fit)[2]
 boxplot(lapply(z, function(u) u^lambda), names = year_labels,
         ylab = paste0("y^", round(lambda, 2)),
         main = "Boxplots of Re-expressed Values")
	## Read several comma-separated files from one directory, either as a list
	## of data frames (one per file) or stacked into a single data frame.
	## NOTE: setwd() changes the working directory for the whole session;
	## replace the placeholder path before running.
	setwd("/YOUR/WORKING/DIRECTORY")

	## Option 1: every entry returned by list.files() (no extension filter).
	file_list <- list.files()
	## One data frame per file, in the same order as file_list.
	dataset <- lapply(file_list, FUN = function(files) {
	  read.table(files, header = TRUE, sep = ",")
	})
	## If the column structure is the same across all files, stack them.
	## The list read above is reused so no file is parsed a second time.
	dataset <- do.call("rbind", dataset)

	## Option 2: only files whose name ends in ".csv".
	## `pattern` is a regular expression, not a shell glob, so the extension
	## is anchored explicitly ("*.csv" would match any name containing "csv").
	temp <- list.files(pattern = "\\.csv$")
	## One data frame per file, in the same order as temp.
	dataset <- lapply(temp, FUN = function(files) {
	  read.table(files, header = TRUE, sep = ",")
	})
	## Inspect a specific file of interest; [[ ]] yields the data frame itself
	## rather than a one-element list.
	dataset[[1]]
	## OR, if the column structure is the same across all CSVs, bind them all
	## into one data frame (again reusing the tables already read).
	dataset <- do.call("rbind", dataset)
	# Summarise missing and distinct values for every column of a data frame.
	#
	# Args:
	#   df:          A data frame.
	#   include.nan: If TRUE, also report NaN counts (nNan) and ratios (rNan).
	#
	# Returns:
	#   A data frame with one row per column of `df` (row names are the column
	#   names of `df`) and these columns:
	#     col               position of the column in `df`
	#     Count             number of rows in `df`
	#     nNA, rNA          count / ratio of values flagged by is.na()
	#     nNan, rNan        count / ratio of values flagged by is.nan()
	#                       (only when include.nan = TRUE)
	#     nUnique, rUnique  count / ratio of distinct values per unique()
	#   Ratios are truncated (not rounded) to four decimals. When `df` has zero
	#   rows the ratios are NaN (0 / 0). A zero-column `df` yields a zero-row
	#   result instead of an error.
	NAsummary <- function(df, include.nan = FALSE) {
	  stopifnot(is.data.frame(df))
	  n_cols <- ncol(df)
	  n_rows <- nrow(df)

	  # Share of rows, truncated to four decimal places.
	  ratio <- function(count) trunc(count / n_rows * 10000) / 10000
	  # Per-column count of elements for which `pred` is TRUE. vapply keeps the
	  # result an integer vector even when `df` has no columns.
	  count_where <- function(pred) {
	    vapply(df, function(x) sum(pred(x)), integer(1))
	  }

	  # seq_len()/rep() keep all three components at length n_cols, so the
	  # zero-column case builds an empty frame rather than failing.
	  newdf <- data.frame(
	    col = seq_len(n_cols),
	    Count = rep(n_rows, n_cols),
	    nNA = count_where(is.na)
	  )
	  newdf$rNA <- ratio(newdf$nNA)
	  if (include.nan) {
	    newdf$nNan <- count_where(is.nan)
	    newdf$rNan <- ratio(newdf$nNan)
	  }
	  newdf$nUnique <- vapply(df, function(x) length(unique(x)), integer(1))
	  newdf$rUnique <- ratio(newdf$nUnique)
	  rownames(newdf) <- colnames(df)
	  newdf
	}
	# Fourth-spread of a numeric vector: upper fourth minus lower fourth.
	#
	# The depth of the fourths is (floor(median depth) + 1) / 2. Each fourth is
	# the mean of the two order statistics on either side of that depth, counted
	# from the bottom (lower fourth) and from the top (upper fourth).
	#
	# Args:
	#   x: A numeric vector. Missing values are ignored.
	#
	# Returns:
	#   A 1 x 1 numeric matrix (a consequence of %*%) holding the spread; a
	#   1 x 1 NA matrix when `x` has no non-missing values.
	spread <- function(x) {
	  # sort() drops NA/NaN, so the length must be taken from the sorted vector;
	  # using length(x) would point the upper indices past the end of the data.
	  sorted <- sort(x)
	  n <- length(sorted)
	  if (n == 0) {
	    return(matrix(NA_real_, nrow = 1, ncol = 1))
	  }
	  n.med <- (n + 1) / 2
	  n.fourth <- (floor(n.med) + 1) / 2
	  y <- sorted[c(floor(n.fourth), ceiling(n.fourth),
	                floor(n + 1 - n.fourth), ceiling(n + 1 - n.fourth))]
	  # (upper pair sum - lower pair sum) / 2 == upper fourth - lower fourth.
	  y %*% c(-1, -1, 1, 1) / 2
	}
	# Spread-vs-level analysis of `x`, followed by a power re-expression.
	# Requires a vector `x` and the function `spread()` to be defined already.
	# Fail early with a clear message instead of a cryptic plotting error.
	stopifnot(length(x) > 0)
	# Group consecutive runs of 12 observations: 0 for the first 12, 1 for the
	# next 12, and so on. Groups are labelled from 2010 upward in the plots
	# (presumably monthly data starting in 2010 -- confirm against the source).
	years <- floor((seq_along(x) - 1) / 12)
	z <- split(x, years)
	year_labels <- (min(years):max(years)) + 2010
	boxplot(z, names = year_labels, ylab = "y")
	# Spread vs. Level Plot
	z.med <- unlist(lapply(z, median))
	z.spread <- unlist(lapply(z, spread))
	# Regress log spread on log level; the slope drives the choice of power.
	fit <- lm(log(z.spread) ~ log(z.med))
	plot(log(z.med), log(z.spread), xlab = "Log Level", ylab = "Log Spread",
	     main = "Spread vs. Level Plot")
	abline(fit, lwd = 2, col = "Red")
	# LAMBDA: power used for re-expression, one minus the fitted slope.
	lambda <- 1 - coef(fit)[2]
	boxplot(lapply(z, function(u) u^lambda), names = year_labels,
	        ylab = paste0("y^", round(lambda, 2)),
	        main = "Boxplots of Re-expressed Values")