Created
July 16, 2020 06:13
-
-
Save mrdwab/f78f96ea337fbc3053a462c17963148a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Calculate the Mean of Already Grouped Data | |
#' | |
#' Calculates the mean of already grouped data given the interval ranges and | |
#' the frequencies of each group. | |
#' | |
#' @param frequencies A vector of frequencies. | |
#' @param intervals A 2-column `matrix` with the same number of rows as | |
#' the length of frequencies, with the first column being the lower class | |
#' boundary, and the second column being the upper class boundary. | |
#' Alternatively, `intervals` may be a character vector, and you may | |
#' specify `sep` (and possibly, `trim` function automatically create the | |
#' required `matrix`. | |
#' @param sep Optional character that separates lower and uppper class | |
#' boundaries if `intervals` is entered as a character vector. | |
#' @param trim Optional leading or trailing characters to trim from the | |
#' character vector being used for `intervals`. There is an in-built pattern | |
#' in the grouped functions to trim the breakpoint labels created by [base::cut()]. | |
#' If you are using a `grouped_*` function on the output of `cut` (where, for some | |
#' reason, you no longer have access to the original data), you can use | |
#' `trim = "cut"`. | |
#' @return A single numeric value representing the grouped mean, median, or | |
#' mode, depending on which function was called. | |
#' | |
#' @export grouped_mean | |
grouped_mean <- function(frequencies, intervals, sep = NULL, trim = NULL) { | |
intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals | |
sum(rowMeans(intervals) * frequencies) / sum(frequencies) | |
} | |
#' @export grouped_mode | |
grouped_mode <- function(frequencies, intervals, sep = NULL, trim = NULL) { | |
intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals | |
ind <- which.max(frequencies) | |
if (length(ind) > 1L) stop("Only for use where there are no ties for highest frequencies across groups.") | |
lw <- .grp_lw(intervals, ind) | |
fm0 <- if (ind == 1) 0 else frequencies[(ind-1)] | |
fm1 <- frequencies[ind] | |
fm2 <- if (ind == length(frequencies)) 0 else frequencies[(ind+1)] | |
lw[[1]] + ((fm1 - fm0) / (2*fm1 - fm0 - fm2)) * lw[[2]] | |
} | |
grouped_median <- function(frequencies, intervals, sep = NULL, trim = NULL) { | |
intervals <- if (is.character(intervals)) .grp_intervals(intervals, sep, trim) else intervals | |
cf <- cumsum(frequencies) | |
ind <- findInterval(max(cf)/2, cf) + 1 | |
lw <- .grp_lw(intervals, ind) | |
f <- frequencies[ind] | |
cf <- cf[(ind - 1)] | |
n <- sum(frequencies) | |
lw[[1]] + (n/2 - cf)/f * lw[[2]] | |
} | |
.grp_intervals <- function(intervals, sep, trim) { | |
if (!is.null(sep)) { | |
if (is.null(trim)) pattern <- "" | |
else if (trim == "cut") pattern <- "\\[|\\]|\\(|\\)" | |
else pattern <- trim | |
matrix( | |
as.numeric(unlist(strsplit(gsub(pattern, "", intervals), sep), use.names = FALSE)), | |
ncol = 2, byrow = TRUE) | |
} | |
} | |
.grp_lw <- function(intervals, ind) { | |
if (ind == 1) { | |
L <- intervals[ind, 1] | |
w <- abs(diff(intervals[ind, ])) | |
} else { | |
if (intervals[ind, 1] == intervals[(ind-1), 2]) { | |
L <- intervals[ind, 1] | |
w <- abs(diff(intervals[ind, ])) | |
} else { | |
L <- mean(c(intervals[ind, 1], intervals[(ind-1), 2])) | |
x <- abs(intervals[ind, 1] - L) | |
w <- abs((intervals[ind, 2] + x) - L) | |
} | |
} | |
list(L, w) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://people.umass.edu/biep540w/pdf/Grouped%20Data%20Calculation.pdf