Jim khakieconomics

@khakieconomics
khakieconomics / truncnorm_ng.stan
Created August 15, 2017 20:55
Unconstrained quantile to truncated normal conversion
functions {
  // lower bound is a, upper bound is b, rv is x, mean is mu, sd is sigma
  vector xi(vector x, real mu, real sigma) {
    return (x - mu) / sigma;
  }
  real alpha(real a, real mu, real sigma) {
    real out;
    out = (a == negative_infinity()) ? negative_infinity() : (a - mu) / sigma;
    return out;
  }
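The helpers above standardize the bounds so a uniform quantile can be mapped through the truncated normal's inverse CDF. A minimal Python/SciPy sketch of that inverse-CDF idea (the function name `trunc_norm_icdf` is illustrative, not from the gist):

```python
from scipy.stats import norm

def trunc_norm_icdf(u, mu, sigma, a, b):
    """Map a quantile u in (0,1) to a Normal(mu, sigma) draw truncated to [a, b]."""
    alpha = norm.cdf((a - mu) / sigma)   # probability mass below the lower bound
    beta = norm.cdf((b - mu) / sigma)    # probability mass below the upper bound
    # squeeze u into the retained slice of probability mass, then invert
    return mu + sigma * norm.ppf(alpha + u * (beta - alpha))

x = trunc_norm_icdf(0.5, mu=0.0, sigma=1.0, a=-1.0, b=2.0)
```

Because `norm.cdf(float('-inf'))` is 0 and `norm.cdf(float('inf'))` is 1, one-sided truncation falls out of the same formula, matching the `negative_infinity()` special case in the Stan code.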
functions {
  real binormal_cdf(real z1, real z2, real rho) {
    if (z1 != 0 || z2 != 0) {
      real denom = fabs(rho) < 1.0 ? sqrt((1 + rho) * (1 - rho)) : not_a_number();
      real a1 = (z2 / z1 - rho) / denom;
      real a2 = (z1 / z2 - rho) / denom;
      real product = z1 * z2;
      real delta = product < 0 || (product == 0 && (z1 + z2) < 0);
      return 0.5 * (Phi(z1) + Phi(z2) - delta) - owens_t(z1, a1) - owens_t(z2, a2);
    }
    // both arguments at zero: closed-form orthant probability
    return 0.25 + asin(rho) / (2 * pi());
  }
}
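The same Owen's-T decomposition of the standard bivariate normal CDF can be sketched in Python (`scipy.special.owens_t` plays the role of Stan's `owens_t`; the guard for a zero argument mirrors the Stan branch):

```python
import numpy as np
from scipy.special import owens_t
from scipy.stats import norm

def binormal_cdf(z1, z2, rho):
    """P(Z1 <= z1, Z2 <= z2) for standard bivariate normals with correlation rho."""
    if z1 == 0.0 and z2 == 0.0:
        # both arguments at the median: closed-form orthant probability
        return 0.25 + np.arcsin(rho) / (2.0 * np.pi)
    denom = np.sqrt((1.0 + rho) * (1.0 - rho))
    with np.errstate(divide="ignore"):
        a1 = (np.divide(z2, z1) - rho) / denom
        a2 = (np.divide(z1, z2) - rho) / denom
    # correction term when the arguments sit in opposite quadrants
    delta = float(z1 * z2 < 0 or (z1 * z2 == 0 and z1 + z2 < 0))
    return (0.5 * (norm.cdf(z1) + norm.cdf(z2) - delta)
            - owens_t(z1, a1) - owens_t(z2, a2))
```

A quick sanity check: at `(0, 0)` with rho = 0.5 the orthant probability is exactly 1/3, and away from the origin the result agrees with `scipy.stats.multivariate_normal.cdf` to numerical-integration accuracy.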
library(rstan); library(dplyr); library(ggplot2); library(reshape2)
options(mc.cores = parallel::detectCores())
set.seed(49)
# Observations
N <- 2000
# Draw errors
rho <- 0.5
Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
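The R snippet builds a 2x2 correlation matrix for drawing correlated errors. As a Python stand-in (the gist's own draw function isn't shown in the preview), the standard recipe is a Cholesky factor applied to independent standard normals:

```python
import numpy as np

rng = np.random.default_rng(49)
rho = 0.5
Sigma = np.array([[1.0, rho], [rho, 1.0]])

L = np.linalg.cholesky(Sigma)                    # Sigma = L @ L.T
errors = rng.standard_normal((2000, 2)) @ L.T    # each row ~ N(0, Sigma)
emp_rho = np.corrcoef(errors.T)[0, 1]            # should be near 0.5
```

With 2000 draws the empirical correlation lands within a few hundredths of the target rho.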
@khakieconomics
khakieconomics / classic_iv.stan
Created October 13, 2017 19:33
Classic instrumental variables regression model for Stan
data {
  int N;
  int PX; // dimension of exogenous covariates
  int PN; // dimension of endogenous covariates
  int PZ; // dimension of instruments
  matrix[N, PX] X_exog; // exogenous covariates
  matrix[N, PN] X_endog; // endogenous covariates
  matrix[N, PZ] Z; // instruments
  vector[N] Y_outcome; // outcome variable
  int<lower=0,upper=1> run_estimation; // simulate (0) or estimate (1)
}
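The classical estimator this Bayesian model generalizes is two-stage least squares. A minimal numpy sketch on simulated data (the data-generating numbers here are illustrative, not from the gist) shows why the instrument is needed: OLS is biased when a confounder drives both the regressor and the outcome.

```python
import numpy as np

rng = np.random.default_rng(42)
N = 5000
z = rng.standard_normal(N)                       # instrument
u = rng.standard_normal(N)                       # unobserved confounder
x = 0.8 * z + u + 0.3 * rng.standard_normal(N)   # endogenous regressor
y = 2.0 * x + u + rng.standard_normal(N)         # true effect of x is 2

# OLS (no intercept; everything is mean zero) is biased upward,
# because u raises both x and y
beta_ols = (x @ y) / (x @ x)

# 2SLS: project x on z, then regress y on the fitted values
x_hat = z * ((z @ x) / (z @ z))
beta_2sls = (x_hat @ y) / (x_hat @ x_hat)
```

In the just-identified case this reduces to the familiar IV ratio `(z'y) / (z'x)`; `beta_2sls` recovers the true coefficient of 2 up to sampling error, while `beta_ols` overshoots it.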
data {
  int N; // number of rows
  int T; // number of individual choice-set/task combinations
  int I; // number of individuals
  int P; // number of covariates
  vector<lower = 0, upper = 1>[N] choice; // binary indicator for choice
  matrix[N, P] X; // product attributes
  int task[T]; // index for tasks
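The data block pairs a binary choice indicator with a matrix of product attributes. A stripped-down Python sketch of the non-hierarchical version of this setup: simulate choices from a logit model, then recover the coefficients with a few Newton steps on the logistic log-likelihood (all names and values here are illustrative).

```python
import numpy as np

rng = np.random.default_rng(0)
N, P = 20000, 3
beta = np.array([0.5, -1.0, 0.25])       # true attribute weights
X = rng.standard_normal((N, P))          # product attributes
p = 1.0 / (1.0 + np.exp(-(X @ beta)))    # choice probabilities
choice = rng.binomial(1, p)              # binary indicator for choice

# Newton-Raphson for the logistic MLE
b = np.zeros(P)
for _ in range(25):
    mu = 1.0 / (1.0 + np.exp(-(X @ b)))
    g = X.T @ (choice - mu)              # score vector
    W = mu * (1.0 - mu)
    H = X.T @ (X * W[:, None])           # observed Fisher information
    b = b + np.linalg.solve(H, g)
```

With 20,000 simulated choices the estimates land within a couple of standard errors of the true weights; the hierarchical Stan model adds individual-level variation on top of this core.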
library(dplyr)
# Let's make some fake data!
# These will contain true labels and 500 binary features
# We'll follow the general approach here: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# but rather than using a penalized likelihood estimator for the label-activity probabilities, we'll approximate
# it by taking the average of the frequency of activity within label and 0.5.
N <- 100000
I <- 30000
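The approximation described in the comments, replacing a penalized-likelihood fit with the average of the within-label activity frequency and 0.5, is a one-liner. A Python sketch on fake binary features (dimensions and names are illustrative):

```python
import numpy as np

rng = np.random.default_rng(1)
n_labels, n_features = 4, 6
true_p = rng.uniform(0.1, 0.9, size=(n_labels, n_features))
labels = rng.integers(0, n_labels, size=3000)
features = rng.binomial(1, true_p[labels])   # binary feature matrix

# Within-label frequency of each feature, then shrink halfway toward 0.5
freq = np.vstack([features[labels == k].mean(axis=0) for k in range(n_labels)])
p_hat = (freq + 0.5) / 2.0
```

The averaging keeps every estimated probability inside [0.25, 0.75], so no label-feature cell ever gets a degenerate 0 or 1 probability, which is the practical point of the penalization it replaces.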
# This illustrates why Neal's Funnel really messes with us when we're trying
# to estimate random effects models using maximum likelihood, i.e. why we need
# approximations with lme4 or INLA, or need to go full Bayes. The mode is an
# awful one.
# Let's simulate some data from a really simple model with 10
# random intercepts whose variance you want to estimate
N <- 1000
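The degeneracy is easy to exhibit directly. In a Python sketch of the simplest random-intercepts model (all numbers illustrative): if you naively maximize the joint likelihood over the intercepts and their scale tau together, setting every intercept to zero and shrinking tau makes the joint log-likelihood grow without bound, so the "mode" sits at the tip of the funnel rather than anywhere useful. lme4-style estimators avoid this by maximizing the marginal likelihood with the intercepts integrated out.

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(7)
n_groups, n_obs = 10, 10
alpha = rng.normal(0.0, 1.0, size=n_groups)     # true random intercepts
y = rng.normal(np.repeat(alpha, n_obs), 1.0)    # 10 observations per group

def joint_loglik(a, tau):
    # log p(y | a) + log p(a | 0, tau): the joint surface a naive
    # optimizer over (a, tau) would climb
    return (norm.logpdf(y, np.repeat(a, n_obs), 1.0).sum()
            + norm.logpdf(a, 0.0, tau).sum())

ll_true = joint_loglik(alpha, 1.0)
# Collapse the intercepts to zero and shrink tau: the prior term
# diverges, dominating the fit to the data
ll_funnel = joint_loglik(np.zeros(n_groups), 1e-20)
```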
# This gist uses the classifier defined in this post: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# applied with this approximation: https://gist.github.com/khakieconomics/0325c054b1499d5037a1de5d1014645a
# to Kaggle's credit card fraud data--a fun rare-case problem. In a cross-validation exercise with 50 randomly selected hold-out
# sets, it appears to perform similarly to (or perhaps better than) others' attempts using random forests and neural networks.
# The upside of course is that it estimates/generates predictions in a couple of seconds.
library(tidyverse); library(reticulate); library(pROC)
# Download some data from Kaggle
system("kaggle datasets download -d mlg-ulb/creditcardfraud -p ~/Documents/kagglestuff")
library(rstanarm); library(tidyverse)
options(mc.cores = parallel::detectCores())
set.seed(42)
data("radon")
head(radon)
# Some levels have no variance in the outcomes, making likelihood estimates impossible
# Adding a tiny bit of noise fixes the problem
radon$log_uranium <- rnorm(nrow(radon), radon$log_uranium, 0.05)
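The jitter trick generalizes: a group whose values are exactly constant has zero sample variance, which breaks variance-component estimates, while a small-sd perturbation restores positive variance without materially moving the values. A Python illustration of the same idea (values illustrative):

```python
import numpy as np

rng = np.random.default_rng(3)

# A grouping level where the value is constant: within-group
# variance is exactly zero
y = np.full(8, 1.25)
assert y.var() == 0.0

# Jitter with small-sd noise: variance becomes positive while the
# values barely move
y_jittered = rng.normal(y, 0.05)
```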