Skip to content

Instantly share code, notes, and snippets.

View danlewer's full-sized avatar

Dan Lewer danlewer

View GitHub Profile
install.packages('Rtools')
install.packages('data.table')
install.packages('stringi')
install.packages('stringr')
install.packages('Amelia')
install.packages('epitools')
install.packages('epiR')
install.packages('RColorBrewer')
install.packages('viridis')
install.packages('viridisLite')
@danlewer
danlewer / montyHall.R
Created December 3, 2021 08:47
Simulate the Monty Hall problem
# function runs the monty hall problem 'n' times and returns the proportions of 'wins' under sticking and switching strategies
montyHall <- function (n = 1e5) {
prize <- sample(1:3, n, replace = T)
firstChoice <- sample(1:3, n, replace = T)
reveal <- matrix(rep.int(1L, n * 3), ncol = 3)
reveal[cbind(rep(seq_len(n), 2), c(prize, firstChoice))] <- 0L
reveal <- max.col(reveal, ties.method = 'random')
switchTo <- 6L - (firstChoice + reveal)
c(stick = mean(firstChoice == prize), switch = mean(switchTo == prize))
# Calculate predicted Forced Exhaled Volume in 1 second (FEV1), based on age, sex, height, and ethnicity
# Function takes vectors
# Formula is from:
# ERS TASK FORCE REPORT. Multi-ethnic reference values for spirometry for the 3-95-yr age range: the global lung function 2012 equations
# Philip H.
# Source: Guideline 2012
# https://www.ers-education.org/lr/show-details/?idP=138497
fev1pred <- function (sex = 'f', heightCM = 180, ageYears = 50, eth = 'White') {
ind <- (sex == 'f') + 1
# following the observation in Lewis et al (2005) https://onlinelibrary.wiley.com/doi/abs/10.1002/pds.1115
# that incidence of various events is higher in the months after patients join an electronic database
# this code provides a function for measuring incidence stratified by time after joining a cohort
# -- function reporting incidence stratified by duration after cohort entry --
# reformat dates as integers with an arbitrary origin, e.g. 1970-01-01
# entry = date of cohort entry
# exit = date of cohort exit
# diagnosis = date of event
# enter a maximum value in your data and generate tick-points for a plot axis
yax <- function(x, tickabove = F, ntick = 5) {
l <- c(c(1, 2, 4, 5, 25) %o% 10^(0:8))
d <- l[which.min(abs(x/ntick - l))]
d <- 0:(ntick+1) * d
i <- findInterval(x, d)
if (tickabove) {i <- i + 1}
d[seq_len(i)]
}
objectSizeTable <- function(unit = 1e6, env = .GlobalEnv, top = 10, totals = T) { # 1e6 approximates Mb (1e9 for Gb)
o <- ls(envir = env)
y <- data.frame(object = o, size = sapply(o, function(x) object.size(get(x, envir = env))))
y$size <- y$size / unit
y <- y[order(y$size, decreasing = T),]
if (totals) {y <- rbind(data.frame(object = c('total', 'other_environments'), size = c(sum(y$size), memory.size() - sum(y$size))), y)}
rownames(y) <- NULL
if (is.na(top)) return(y) else return(y[seq_len(top + (2 * totals)),])
}
# vectorised confidence intervals for poisson count in R
# wrapper for base::poisson.test
# returns rate and 95% confidence interval
# for use with vectors of counts of person-times
# x = vector of counts
# t = vector of corresponding person-times
# form = output format. T = formatted; F = matrix of rate, lower, upper
# digs = number of digits if form = T
# uses 'mapply' so not actually vectorized for performance
# ... additional arguments passed to poisson.test (e.g. 'confidence.level')
# make a merrimekko chart (100% stacked bar chart with variable widths)
# m is a matrix
# cols is a vector of colours of same length as number of rows in matrix
# ... is other arguments passed to plot
# set par(xpd = NA) to see x-axis labels
mm <- function(m, cols = NA, ...) {
widths <- colSums(m)
xs <- c(0, cumsum(widths)) / sum(m)
xl <- rep(xs[-length(xs)], each = nrow(m)) # x-left
# 'a' is a vector
# 'vals' is a vector
# returns a vector of same lengths as 'a' showing the last value in 'a' equal to any value in 'vals'
# 'fill' is the value given if none of the values in 'vals' has yet occurred in 'a'
# 'excl' is a vector specifying values of 'vals' that are to be excluded. Best used when vals is not specified (and defaults to unique(a))
last_status <- function(a, vals = unique(a), excl = NA, fill = NA) {
if (!is.na(excl)) {vals <- setdiff(vals, excl)}
i <- a %in% vals
j <- a[i][cumsum(i)]
# this code calculates a life table using mortality rates by single-year-of-age
# confidence intervals are calculated assuming that each year-of-age is an independent sample
#-----------------
# make sample data
#-----------------
set.seed(56)
dat <- data.frame(age = 0:99)
dat$person_years <- 50000 / (1 + exp(-(seq(10, -5, length.out = nrow(dat)))))