Skip to content

Instantly share code, notes, and snippets.

@vankesteren
vankesteren / penalized_synthetic_control.R
Created November 24, 2023 09:58
penalized synthetic control estimation (Abadie & L'Hour, 2021)
#' Penalized synthetic control estimator
#'
#' Estimate synthetic control with penalization
#' according to Abadie & L'Hour.
#'
#' @param X1 treated unit covariates
#' @param X0 donor units covariates
#' @param v variable weights
#' @param lambda penalization parameter
#' @param ... osqp settings using osqp::osqpSettings()
@vankesteren
vankesteren / proportion_intervals.R
Created October 12, 2023 12:53
Uncertainty intervals for a proportion
# Different 95% uncertainty intervals for a proportion
dat <- c(rep(0, 38), rep(1, 2))
# normal approximation on probability scale
ci_normal <- function(dat) {
mu <- mean(dat)
se <- sqrt(mu * (1 - mu) / length(dat))
return(c(
"2.5 %" = mu + qnorm(0.025)*s_normal,
"97.5 %" = mu + qnorm(0.975)*se
@vankesteren
vankesteren / network_autocorrelation.R
Last active September 20, 2023 15:29
Network autocorrelation model
# simulate and estimate a network autocorrelation model
set.seed(45)
N <- 200
A <- matrix(rbinom(N*N, 1, 0.2), N)
diag(A) <- 0
A2 <- matrix(rbinom(N*N, 1, 0.2), N)
diag(A2) <- 0
An <- A / rowSums(A) # row-normalized ????
# params
@vankesteren
vankesteren / split_kfold_conformal_lm.R
Created September 13, 2023 13:33
Simple split & k-fold conformal prediction for linear regression model
# Conformal prediction for regression prediction intervals
# see example 2.3.1 from https://arxiv.org/pdf/2107.07511.pdf
# goal: make a 90% prediction interval for linear model
# First simulate some data with non-normal residuals
set.seed(45)
sim_data <- function(N) {
x <- runif(N, min = -4, max = 4)
y <- x + rgamma(N, 2, 1) - 2
data.frame(x = x, y = y)
@vankesteren
vankesteren / conformal.R
Last active September 13, 2023 13:30
Conformal prediction for linear regression and random forest. NB: pretty ugly and slow code.
# conformal prediction intervals for linear regression and random forest
library(tidyverse)
library(pbapply)
library(parallel)
# fully conformal prediction
conformal_quantile <- function(x, y, x_new, y_new, frm) {
N <- length(x)
df <- tibble(x = c(x, x_new), y = c(y, y_new))
fy <- lm(frm, df)
@vankesteren
vankesteren / quicksort.jl
Created August 13, 2023 10:37
Quicksort in Julia using recursion
function quicksort!(a::AbstractArray, lo::Int, hi::Int)
if lo > 0 && hi > 0 && lo < hi
p = partition!(a, lo, hi)
quicksort!(a, lo, p)
quicksort!(a, p + 1, hi)
end
end
quicksort!(a::AbstractArray) = quicksort!(a, firstindex(a), lastindex(a))
@vankesteren
vankesteren / subsample_and_aggregate.R
Last active August 7, 2023 13:04
Subsample and aggregate method from https://dl.acm.org/doi/pdf/10.1145/1993636.1993743 for estimation
#' Compute any function of single data vector in a differentially private way.
#'
#' The subsample-and-aggregate method applies the function to O(sqrt(n)) subsamples
#' of the data and then averages the function results using a differentially private
#' aggregation function called the "widened winsorized mean".
#'
#' @param x data, single vector
#' @param FUN function returning a single numeric value
#' @param epsilon differential privacy parameter
#' @param lower theoretical lower bound of interest for the function value
@vankesteren
vankesteren / marchenkopastur.R
Created July 31, 2023 14:02
3d-plot of Marchenko-Pastur law
# 3d-plot of Marchenko-Pastur law
# p 6 of High-Dim statistics (wainwright)
library(rgl)
alpha <- seq(0.01, 0.99, 0.01)
gamma <- seq(0, 4, length.out = 500)
fmp <- function(g, a) {
gtmin <- g - (1 - sqrt(a))^2
tmaxg <- (1 + sqrt(a))^2 - g
if (gtmin <= 0 | tmaxg <= 0) return(0)
sqrt(tmaxg * gtmin) / g # see errata https://people.eecs.berkeley.edu/~wainwrig/highdim_errata.txt
@vankesteren
vankesteren / lavaan_separate_joint.R
Created July 28, 2023 09:44
Double check separate vs. joint model in lavaan
library(lavaan)
# sim_dat function from my blog post
# https://erikjanvankesteren.nl/blog/multivariate_regression
sim_dat <- function(N = 100, P = 5, Q = 2, varprop = 0.5, rho_X = 0, rho_B = 0, rho_E = 0, exact = TRUE) {
# Create X matrix
X <- matrix(rnorm(N*P), N)
RX <- matrix(rho_X, P, P)
diag(RX) <- 1
CRX <- chol(RX)
import gzip
x1 = "Japan’s Seiko Epson Corp. has developed a 12-gram flying microrobot."
x2 = "The latest tiny flying robot has been unveiled in Japan."
x3 = "Michael Phelps won the gold medal in the 400 individual medley."
def ncd(x: str, y: str) -> float:
'''
Returns the normalized compression distance between two strings.