Created
January 8, 2014 16:49
-
-
Save derrickturk/8319977 to your computer and use it in GitHub Desktop.
Here's a simple but useful function for quickly reviewing distributions and missing data. Suitable for preliminary data QC and exploratory analysis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (c) 2013 dwt | terminus data science, LLC
# freely licensed for non-commercial use
qc.plots <- function(df, max.levels=25, ask=TRUE, col="white") | |
{ | |
old.par <- par(ask=ask) | |
on.exit(par(old.par)) | |
for (var in names(df)) { | |
vals <- df[[var]] | |
if (is.numeric(vals)) { | |
n.missing <- sum(is.na(vals)) | |
pct.missing <- n.missing / length(vals) * 100 | |
hist(vals, main=var, col=col) | |
if (n.missing != 0) { | |
legend("topright", | |
paste("# missing: ", n.missing, "\n", | |
"% missing: ", format(pct.missing, digits=3), | |
sep=""), | |
bty="n", | |
text.col="red") | |
} | |
} else { | |
with.NAs <- factor(vals, exclude=NULL) | |
n.levels <- length(attr(with.NAs, "levels")) | |
cols <- rep(col, n.levels) | |
if (any(is.na(vals))) { | |
attr(with.NAs, "levels")[n.levels] <- "<missing>" | |
cols[n.levels] <- "red" | |
} | |
if (n.levels > max.levels) { | |
plot(1, type="n", axes=F, xlab="", ylab="", main=var) | |
legend("top", paste("too many levels:", n.levels), text.col="red", bty="n") | |
legend("center", paste(attr(with.NAs, "levels"), "\n", sep=""), bty="n") | |
} else { | |
barplot(table(with.NAs), main=var, ylab="Frequency", col=cols) | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment