Last active
July 14, 2018 19:33
-
-
Save rcdilorenzo/bf7fc25b91cf6653ff29f4111dc47156 to your computer and use it in GitHub Desktop.
Collection of helpful EDA functions in R (originally created for M.S. in Data Science assignment work at Regis University)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Plot a histogram of counts with a kernel density curve overlaid
#'
#' The density estimate is rescaled from the probability scale to the count
#' scale of the histogram so both can share one y-axis.
#'
#' @param data Numeric vector. Non-finite values (NA, NaN, Inf) are dropped
#'   before both the histogram and the density are computed.
#' @param xlab Label for the x-axis.
#' @param font.main Font used for the main title (see `par`).
#' @param main Main title of the plot.
#' @return Invisibly, the rescaled `density` object that was drawn, or `NULL`
#'   when fewer than two finite values are available and only the histogram
#'   is drawn.
hist.density <- function (data, xlab = "<x>", font.main = 1,
                          main = "Histogram of data") {
  if (!is.numeric(data)) {
    stop("'data' must be numeric", call. = FALSE)
  }
  # hist() silently drops non-finite values but density() stops on NA, so
  # filter once up front and give both the same observations
  values <- data[is.finite(data)]
  if (length(values) == 0) {
    stop("'data' must contain at least one finite value", call. = FALSE)
  }

  # Calculate histogram (equal-width bins) without drawing it yet
  data.hist <- hist(values, plot = FALSE)

  # density() cannot pick a bandwidth from a single point; show bars only
  if (length(values) < 2) {
    plot(data.hist, xlab = xlab, main = main, font.main = font.main,
         ylim = c(0, max(data.hist$counts)))
    return(invisible(NULL))
  }

  # For equal-width bins: count = density * n * bin width, so this factor
  # converts the density curve to the histogram's count scale
  multiplier <- length(values) * diff(data.hist$breaks)[1]

  # Kernel density estimate, rescaled to counts
  data.density <- density(values)
  data.density$y <- data.density$y * multiplier

  # Plot histogram with labels; y-limit must fit both the bars and the curve
  plot(data.hist, xlab = xlab, main = main, font.main = font.main,
       ylim = c(0, max(c(data.density$y, data.hist$counts))))

  # Add density function as a line overlay
  lines(data.density)

  invisible(data.density)
}
#' Draw a grid of per-variable distribution plots for a data frame
#'
#' Each variable occupies two stacked layout cells. Discrete columns (factor,
#' character, logical) get a title cell above a bar plot of frequencies; all
#' other columns get a horizontal box plot above a histogram with a density
#' overlay drawn by `hist.density`. Cells are filled column by column.
#'
#' @param dataframe Data frame whose columns are plotted in order.
#' @param vdescriptions Titles for the variables, one per column.
#' @param rows,columns Number of variables per grid column / grid columns.
#' @param main Overall title written in the outer margin.
#' @param mar,oma Inner and outer margins (in lines) passed to `par`.
#' @return `NULL`, invisibly. Called for its plotting side effect. The `mar`
#'   and `oma` settings are restored on exit; the layout is left in place.
dist.summary <- function (dataframe, vdescriptions = names(dataframe),
                          rows = 3, columns = 3,
                          main = 'Distribution of Variables',
                          mar = c(1, 1, 2, 0), oma = c(1, 1, 3, 1)) {
  # Setup layout: two cells per variable, short (2) above tall (3)
  total <- rows * columns
  layout(mat = matrix(seq_len(total * 2), rows * 2, columns, byrow = FALSE),
         heights = rep(c(2, 3), rows))

  # Apply spacing and put the caller's margins back when we are done
  # (restoring `mar` also undoes the `mai` changes made below)
  old.par <- par(mar = mar, oma = oma)
  on.exit(par(old.par), add = TRUE)

  for (index in seq_len(ncol(dataframe))) {
    # Positional extraction works with duplicate names and with tibbles
    column <- dataframe[[index]]

    # Check for discrete / continuous
    if (is.factor(column) || is.character(column) || is.logical(column)) {
      # Default padding; empty cell that only carries the title
      par(mai = rep(0.3, 4))
      plot.new()
      title(main = vdescriptions[index])

      # Padding (except top)
      par(mai = c(0.3, 0.3, 0, 0.3))

      # Display bar plot of frequencies
      barplot(table(column))
    } else {
      # Padding (expanded for top and none for bottom)
      par(mai = c(0, 0.3, 0.5, 0.3))
      boxplot(column, cex = 0.8, horizontal = TRUE, pch = '.',
              main = vdescriptions[index], outline = FALSE, axes = FALSE)

      # Padding (except top)
      par(mai = c(0.3, 0.3, 0, 0.3))

      # Display histogram and density function
      hist.density(column, font.main = 1, main = '')
    }
  }

  title(main = main, outer = TRUE)
  invisible(NULL)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment