Skip to content

Instantly share code, notes, and snippets.

@vankesteren
Last active May 30, 2022 15:02
Show Gist options
  • Save vankesteren/b7c522e319a54944a1a581b7bb78a3ed to your computer and use it in GitHub Desktop.
Save vankesteren/b7c522e319a54944a1a581b7bb78a3ed to your computer and use it in GitHub Desktop.
MetaSynth in Julia
using Distributions
using Random
using DataFrames
"""
    MetaVariable(name, p_missing, dist)

Metadata describing a single column.

# Fields
- `name`: the column name.
- `p_missing`: probability with which a drawn value is replaced by `missing`.
- `dist`: univariate distribution that non-missing values are drawn from.
"""
struct MetaVariable
    name::String
    p_missing::Float64
    dist::UnivariateDistribution
end
"""
    MetaVariable(name::String, col::Vector, dist_options::Vector)

Fit every candidate in `dist_options` to the non-missing entries of `col` and keep
the fit with the lowest BIC.

# Arguments
- `name`: name stored on the resulting `MetaVariable`.
- `col`: observed values; `missing` entries are excluded from fitting and counted
  towards the missingness proportion.
- `dist_options`: distribution types handed to `fit`, tried in order.

# Returns
A `MetaVariable` holding `name`, the proportion of missing entries in `col`
(`0.0` for an empty `col`) and the selected distribution. If no candidate can be
fitted, the distribution is `Normal(0, 1)`.
"""
function MetaVariable(name::String, col::Vector, dist_options::Vector)
    observed = collect(skipmissing(col))
    n_obs = length(observed)
    best_bic = Inf
    best_dist = Normal(0, 1)  # fallback when every candidate fails to fit
    for dist in dist_options
        try
            candidate = fit(dist, observed)
            # BIC = k*log(n) - 2*loglik. The penalty must use the candidate's own
            # parameter count, not that of the best distribution found so far.
            n_params = length(params(candidate))
            loglik = sum(logpdf.(candidate, observed))
            bic = n_params * log(n_obs) - 2 * loglik
            if bic < best_bic
                best_dist = candidate
                best_bic = bic
            end
        catch e
            # Best effort: a family that cannot be fitted to this column is skipped.
            @debug "Distribution '$dist' could not be estimated: $e"
        end
    end
    # Guard against 0/0 (NaN) for an empty column.
    p_missing = isempty(col) ? 0.0 : (length(col) - n_obs) / length(col)
    return MetaVariable(name, p_missing, best_dist)
end
"""
    MetaVariable(name::String, col::Vector{<:Union{Missing, AbstractFloat}})

Build a `MetaVariable` for a floating-point column by trying a default set of
continuous distributions: `Normal`, `LogNormal`, `Exponential`, `Gamma`, `Cauchy`
and `Beta`.

The element type may or may not include `Missing`, so columns without any missing
values (e.g. `Vector{Float64}`) are accepted as well.
"""
function MetaVariable(name::String, col::Vector{<:Union{Missing, AbstractFloat}})
    return MetaVariable(name, col, [Normal, LogNormal, Exponential, Gamma, Cauchy, Beta])
end
"""
    MetaVariable(name::String, col::Vector{<:Union{Missing, Integer}})

Build a `MetaVariable` for an integer column by trying a default set of discrete
distributions: `Bernoulli`, `Binomial`, `Categorical`, `DiscreteUniform` and
`Geometric`.

The element type may or may not include `Missing`, so columns without any missing
values (e.g. `Vector{Int}`) are accepted as well.
"""
function MetaVariable(name::String, col::Vector{<:Union{Missing, Integer}})
    return MetaVariable(
        name, col, [Bernoulli, Binomial, Categorical, DiscreteUniform, Geometric]
    )
end
"""
    draw(x::MetaVariable, N::Int)

Draw `N` synthetic values from `x.dist`, then replace each one by `missing`
independently with probability `x.p_missing`.

Returns a `Vector{Union{Missing, T}}`, where `T` is the element type of the samples
produced by `rand(x.dist, N)`.
"""
function draw(x::MetaVariable, N::Int)
    samples = rand(x.dist, N)
    # Widen using the eltype of the drawn samples rather than `eltype(x.dist)`, so the
    # conversion never coerces values to a type the sampler did not produce.
    out = convert(Vector{Union{Missing, eltype(samples)}}, samples)
    out[rand(N) .< x.p_missing] .= missing
    return out
end
"""
    MetaDataset(N, vars)

Metadata describing a whole table.

# Fields
- `N`: default number of rows to generate when drawing a synthetic dataset.
- `vars`: one `MetaVariable` per column.
"""
struct MetaDataset
    N::Int
    vars::Vector{MetaVariable}
end
"""
    MetaDataset(df::DataFrame)

Infer a `MetaDataset` from `df`: `N` is the number of rows of `df`, and each column
is summarised by `MetaVariable(name, column)`.
"""
function MetaDataset(df::DataFrame)
    # `names(df)` is evaluated once and `eachcol` yields the stored columns without
    # copying them; the typed comprehension avoids a `Vector{Any}` accumulator.
    vars = MetaVariable[
        MetaVariable(nm, col) for (nm, col) in zip(names(df), eachcol(df))
    ]
    return MetaDataset(nrow(df), vars)
end
"""
    draw(x::MetaDataset, N::Int)

Generate a synthetic `DataFrame` with one column per entry of `x.vars`, each
produced by `draw(var, N)` and named after `var.name`.
"""
function draw(x::MetaDataset, N::Int)
    columns = [draw(v, N) for v in x.vars]
    colnames = [v.name for v in x.vars]
    return DataFrame(columns, colnames)
end
"""
    draw(x::MetaDataset)

Generate a synthetic dataset using the row count stored in `x.N`.
"""
draw(x::MetaDataset) = draw(x, x.N)
"""
    print([io::IO], x::MetaDataset)

Write a plain-text summary of `x` to `io` (`stdout` when `io` is omitted): the row
count `N`, followed by one line per variable giving its name, its distribution and
its missingness proportion.
"""
# Extends `Base.print` instead of defining a fresh `print` in the enclosing module;
# a module-local `print` would shadow `Base.print` for every other argument type.
function Base.print(io::IO, x::MetaDataset)
    println(io, "N: ", x.N)
    println(io, "Vars:")
    for var in x.vars
        println(io, " - ", var.name, " | ", var.dist, " (missing: ", var.p_missing, ")")
    end
    return nothing
end
# Demo: sample a synthetic dataset from a hand-specified MetaDataset, then
# re-estimate a MetaDataset from that sample and print the result.
mds = MetaDataset(
    100,
    [
        MetaVariable("NormalVar", 0.3, Normal(2, 1)),
        MetaVariable("LogNormalVar", 0.0, LogNormal(5, 3)),
        MetaVariable("Categorical", 0.2, Categorical([0.1, 0.3, 0.3, 0.2, 0.1])),
    ],
)
df = draw(mds)
mds_fitted = MetaDataset(df)
print(mds_fitted)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment