Created
July 9, 2018 20:19
-
-
Save mason-stewart/d359fece05eeb2172e188da1bed590fe to your computer and use it in GitHub Desktop.
This generates the following error:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# sentio model v3
# use:
# Sentio gives a client's best employees the test.
# Those employees are compared to previous sentio test takers
# to determine which characteristics are most similar to
# best employees.
# output:
# 1) output scores (0-10) for total and for each input variable
# for the "best" employees and the employees which need to be scored
# 2) parameters from regression
# 3) scaling parameters
# diagnostics:
# provided in a non-production file (sentio_diagnostics.r)
#
# Data file requirements
# d: file of example people and example target, see training_data_v2.csv
# WordCount all greater than 100
# No NA/Missing scores.
# Top:
# 1, if one of the targets,
# 0, if example data,
# -1, if it should be excluded from the regression but scored on the resulting model
# Estimation process
#
# Use target and example population Top %in% c(0,1)
# 1. Target variable: 1 if in target, -1 if not in target
#
# 2. For each explanatory variable, calculate empirical distribution and label each observation
# with its empirical percentile (will be between 0 and 1 inclusive)
#
# 3. Standardize explanatory variables by subtracting mean (0.5) and dividing by sd (always .2887ish for uniform [0,1])
#
# 4. Regress target on each variable independently without an intercept
#
# 5. Generate predictions from each variable by multiplying the coefficient by the score for each individual
#
# 6. Find the mean and sd of all the scores (across all people and variables)
# standardize: z = (x-mean)/sd (the code below widens this sd by a factor of 1.5)
# scale: pnorm(z)*10 to convert it to a score between 0 and 10
# Doing this across all scores allows you to see easily that a 10 is really good and a 0 is really bad
# the weighting of the variable is implied by the score
#
# 7. Do the same scaling (0-10) for the final score.
# Start with a clean environment.
# NOTE(review): this clears the global workspace, which matters if the script
# is ever source()d from an interactive session rather than run via Rscript.
rm(list = ls(all.names = TRUE))

# For the %>% (pipe) operator
library(magrittr)
# For some manipulation functions
library(tidyr)
library(dplyr)

# Not used at the moment
app_dir <- getwd()

# Command line: the first trailing argument is the input CSV, the second is
# the path the scored CSV is written to. Fail early with a usage message
# instead of letting read.csv() choke on an NA path further down.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("usage: Rscript <script> <input.csv> <output.csv>", call. = FALSE)
}
inputFile <- args[1]
outputFile <- args[2]
# Read in training data
d <- read.csv(inputFile)

# Validate the layout the rest of the script relies on: the identifying
# columns are selected by name when the output is assembled, and every column
# from the 6th onward is treated as a numeric score.
required_cols <- c("Id", "Name", "Top", "Department", "WordCount")
missing_cols <- setdiff(required_cols, colnames(d))
if (length(missing_cols) > 0) {
  stop(
    "input file is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
if (ncol(d) < 6) {
  stop("input file has no score columns (expected from column 6 onward)",
       call. = FALSE)
}

# Names of the score columns (column 6 through the last column).
watson_vars <- colnames(d)[6:ncol(d)]

if (!all(vapply(d[watson_vars], is.numeric, logical(1)))) {
  stop("all score columns (column 6 onward) must be numeric", call. = FALSE)
}
# A single NA score would propagate through the pooled mean/sd below and turn
# every output score into NA, so reject it up front.
if (any(vapply(d[watson_vars], anyNA, logical(1)))) {
  stop("score columns must not contain NA/missing values", call. = FALSE)
}

# 1 for rows that take part in the regression (Top is 0 or 1), 0 otherwise.
use_in_estimation <- as.numeric(d$Top %in% c(0, 1))
# Top recoded to -1 / +1 for estimation rows; 0 for every other row.
target <- ifelse(use_in_estimation, 2 * d$Top - 1, 0)

# Per-variable mean and standard deviation of the raw scores; the SD is
# floored at 0.20. These feed the default `y` argument of f().
var_mean <- vapply(d[watson_vars], mean, numeric(1))
var_sd <- pmax(vapply(d[watson_vars], sd, numeric(1)), .20)
var_stats <- cbind(Mean = var_mean, SD = var_sd)
# Convert a numeric vector to percentile-based z-scores.
#
# Each value is replaced by its empirical percentile within `x` (a number in
# (0, 1]), then centred on 0.5 and divided by 0.2887, the approximate standard
# deviation of a uniform [0, 1] variable.
#
# x: numeric vector of raw scores.
# Returns a numeric vector the same length as `x`.
z_scorer <- function(x) {
  percentile_of <- ecdf(x)
  (percentile_of(x) - .5) / .2887
}
# Standardize `x` with per-variable statistics.
#
# x: numeric values, one per variable described in `y`.
# y: table with one row per variable and columns "Mean" and "SD";
#    defaults to the script-level `var_stats`.
# Returns (x - Mean) / SD, element by element.
f <- function(x, y = var_stats) {
  centred <- x - y[, "Mean"]
  centred / y[, "SD"]
}
# Raw score columns (column 6 through the last column of d).
# drop = FALSE keeps a data frame even when there is only one score column.
raw_scores <- d[, 6:length(d), drop = FALSE]

# Percentile-based z-score for every score column. Working column by column
# with lapply() keeps one row per person and one column per variable for any
# number of rows or columns, which apply() does not guarantee.
z_score <- as.data.frame(lapply(raw_scores, z_scorer))
colnames(z_score) <- colnames(raw_scores)
# Zero out variables which are bad
# Bad optics, plus the score has little variation
#z_score = sign(z_score)*pmin(abs(z_score),2.0) %>% as.data.frame
#z_data=z_score[,1]
## target = d$Top
# Fit a no-intercept linear regression of `target` on one explanatory variable.
#
# z_data: numeric vector of (standardized) values for a single variable.
# target: numeric vector of the same length holding the response.
# Returns an unnamed numeric vector: the slope coefficient followed by the
# model's R-squared.
estimation <- function(z_data, target) {
  fit_frame <- as.data.frame(cbind(x = z_data, y = target))
  # Alternative kept for reference (logistic fit):
  # fit <- glm(y ~ 0 + x, data = fit_frame, family = binomial(link = "logit"))
  fit <- lm(y ~ 0 + x, data = fit_frame)
  unname(c(coef(fit), summary(fit)$r.squared))
}
# Rows flagged for estimation (Top is 0 or 1) are used to fit the regressions.
fit_rows <- use_in_estimation == 1
if (!any(fit_rows)) {
  stop("no rows with Top in c(0, 1): nothing to fit the regressions on",
       call. = FALSE)
}

# Recode Top from 0 / 1 to -1 / +1 for the regression target.
target_var <- d$Top[fit_rows] * 2 - 1
# drop = FALSE keeps a data frame even when there is only one score column.
x_vars <- z_score[fit_rows, , drop = FALSE]

# One independent no-intercept regression per variable. The result has one
# column per variable: row 1 is the coefficient, row 2 the R-squared.
regression_params <- as.data.frame(
  vapply(x_vars, estimation, numeric(2), target = target_var)
)
betas <- unlist(regression_params[1, ])
rsq <- unlist(regression_params[2, ])
# Square root of R-squared for each variable.
r <- rsq^(1/2)
# Per-variable prediction: every person's z-score times that variable's
# coefficient. sweep() multiplies column j by betas[j] and keeps the
# people-by-variables shape for any number of score columns.
preds <- as.data.frame(sweep(as.matrix(z_score), 2, betas, "*"))

# Stage 1 scaling parameters, pooled over all people and all variables.
stage1_mean <- mean(unlist(preds))
# NOTE(review): the pooled SD is multiplied by 1.5; the reason is not
# documented in this file -- confirm before changing.
stage1_sd <- sd(unlist(preds)) * 1.5
preds_normalized <- (preds - stage1_mean) / stage1_sd

# Regression parameters, one row per variable.
params_regression <- as.data.frame(cbind(betas = betas, rsq = rsq))
rownames(params_regression) <- names(betas)

# Overall score per person: mean of that person's normalized predictions.
total <- rowSums(preds_normalized) / ncol(preds_normalized)
total_mean <- mean(total)
total_sd <- sd(total)

# Scaling parameters collected into a one-row table.
params_scaling <- as.data.frame(cbind(
  stage1_mean = stage1_mean,
  stage1_sd = stage1_sd,
  total_mean = total_mean,
  total_sd = total_sd
))

# Map standardized values through the normal CDF onto a 0-10 scale,
# rounded to one decimal place.
total_normalized <- (total - total_mean) / total_sd
total_score <- round(pnorm(total_normalized) * 10, 1)
preds_score <- round(pnorm(as.matrix(preds_normalized)) * 10, 1)
# Assemble the identifying columns, the overall score and the per-variable
# scores, keep only rows whose Top is 1 or -1, and write them to outputFile.
id_columns <- d[, c("Id", "Name", "Top", "Department", "WordCount")]
all_scores <- cbind(id_columns, Total = total_score, preds_score)
output_scores <- filter(all_scores, Top %in% c(1, -1))
write.csv(output_scores, file = outputFile)
# alternative output format, not currently used.
# ask Bill West for more details
# write.csv(
# params_regression[sort.list(params_regression$rsq,decreasing=T),],file="output_params_regression.csv"
# )
# write.csv(
# params_scaling,file="output_params_scaling.csv"
# )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Id | Name | Top | Department | WordCount | openness | adventurousness | artistic_interests | emotionality | imagination | intellect | liberalism | conscientiousness | achievement_striving | cautiousness | dutifulness | orderliness | self_discipline | self_efficacy | extraversion | activity_level | assertiveness | cheerfulness | excitement_seeking | friendliness | gregariousness | agreeableness | altruism | cooperation | modesty | morality | sympathy | trust | neuroticism | anger | anxiety | depression | immoderation | self_consciousness | vulnerability | challenge | closeness | curiosity | excitement | harmony | ideal | liberty | love | practicality | self_expression | stability | structure | conservation | openness_to_change | hedonism | self_enhancement | self_transcendence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7e1416bf-b1db-456e-aec2-3215a384825c | AJ Richichi | -1 | Entry-Level Sales | 828 | 0.98 | 0.668 | 0.346 | 0.094 | 0.434 | 0.97 | 0.72 | 0.875 | 0.981 | 0.904 | 0.421 | 0.332 | 0.873 | 0.971 | 0.985 | 0.989 | 0.988 | 0.357 | 0.162 | 0.427 | 0.234 | 0.213 | 0.565 | 0.462 | 0.038 | 0.538 | 0.572 | 0.815 | 0.963 | 0.286 | 0.102 | 0.434 | 0.134 | 0.211 | 0.057 | 0.512 | 0.039 | 0.574 | 0.113 | 0.06 | 0.269 | 0.233 | 0.095 | 0.497 | 0.184 | 0.225 | 0.761 | 0.037 | 0.431 | 0.019 | 0.275 | 0.149 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment