mason-stewart · July 9, 2018 20:19
diff --git a/analyzer.r b/analyzer.r
 # sentio model v3
 #   use:
 #     Sentio gives a client's best employees the test
 #     Those employees are compared to previous sentio test takers
 #     to determine which characteristics are most similar to
 #     best employees.
 #   output:
 #     1) output scores (0-10) for total and for each input variable
 #     for the "best" employees and the employees which need to be scored
 #     2) parameters from regression
 #     3) scaling parameters
 #   diagnostics:
 #     provided in a non-production file (sentio_diagnostics.r)
 #

 # Data file requirements
 #  d: file of example people and example target, see training_data_v2.csv
 #  Wordcount all greater than 100
 #  No NA/Missing scores.
 #  Top:
 #   1, if one of the targets,
 #   0, if example data,
 #   -1, if it should be excluded from the regression but scored on the resulting model

 # Estimation process
 #
 # Use target and example population Top %in% c(0,1)
 #  1. Target variable:  1 if in target, -1 if not in target
 #
 #  2. For each explanatory variable, calculate empirical distribution and label each observation
 #  with its empirical percentile (will be between 0 and 1 inclusive)
 #
 #  3. Standardize explanatory variables by subtracting mean (0.5) and dividing by sd (always .2887ish for uniform [0,1])
 #
 #  4. regress target on each variable independently without an intercept
 #
 #  5. Generate predictions from each variable by multiplying the coefficient by the score for each individual
 #
 #  6. Find the mean and sd of all the scores (across all people and variables)
 #    standardize: z = (x-mean)/sd
 #    scale:  pnorm(z)*10 to convert it to a score between 0 and 10
 #    Doing this across all scores allows you to see easily that a 10 is really good and a 0 is really bad
 #    the weighting of the variable is implied by the score
 #
 #  7 Do the same scaling (0-10)for the final score

 # Clear every object (hidden ones included) from the workspace before running
 rm(list = ls(all.names = TRUE))

 # magrittr supplies the %>% pipe used throughout the script
 library(magrittr)

 # Start-up working directory; nothing in this script reads it yet
 app_dir <- getwd()

 # Positional command-line arguments: 1 = input CSV path, 2 = output CSV path
 args <- commandArgs(trailingOnly = TRUE)
 inputFile <- args[1]
 outputFile <- args[2]

 # For some manipulation functions
 library(tidyr)
 library(dplyr)

 # Read in training data.
 # A missing command-line argument leaves inputFile/outputFile as NA, which
 # would otherwise only surface as an obscure connection error, so stop early
 # with a usage message.
 if (is.na(inputFile) || is.na(outputFile)) {
   stop("usage: Rscript <script> <input.csv> <output.csv>", call. = FALSE)
 }
 d <- read.csv(inputFile)

 # The output step selects these descriptive columns by name, and the score
 # variables are taken by position from column 6 onwards; check both up front
 # so a malformed file is reported clearly instead of failing later.
 required_cols <- c("Id", "Name", "Top", "Department", "WordCount")
 missing_cols <- setdiff(required_cols, colnames(d))
 if (length(missing_cols) > 0) {
   stop("input file is missing required column(s): ",
        paste(missing_cols, collapse = ", "), call. = FALSE)
 }
 if (ncol(d) < 6) {
   stop("input file has no score columns (expected from column 6 onwards)",
        call. = FALSE)
 }

 # Rows with Top of 0 or 1 take part in the regression (flag 1); all others get 0
 use_in_estimation <- ifelse(d$Top %in% c(0, 1), 1, 0)
 # Regression target: Top 1 -> 1, Top 0 -> -1, rows outside the estimation -> 0
 target <- ifelse(use_in_estimation, 2 * d$Top - 1, 0)
 # Every column from the 6th onwards is treated as a score variable
 watson_vars <- colnames(d)[6:ncol(d)]

 # Per-variable mean and sd (sd floored at 0.20), combined into one table.
 # drop = FALSE keeps a data frame even when there is a single score column.
 var_mean <- apply(d[, watson_vars, drop = FALSE], 2, "mean")
 var_sd <- pmax(apply(d[, watson_vars, drop = FALSE], 2, "sd"), .20)
 var_stats <- cbind(Mean = var_mean, SD = var_sd)

 # Map each value of x to its empirical percentile (via ecdf) and standardize
 # it by centring on .5 and dividing by .2887
 z_scorer <- function(x) {
   percentile <- ecdf(x)(x)
   (percentile - .5) / .2887
 }

 # Standardize x using supplied statistics.
 # y must provide "Mean" and "SD" columns covering all of the variables;
 # it defaults to the var_stats table.
 f <- function(x, y = var_stats) {
   centred <- x - y[, "Mean"]
   centred / y[, "SD"]
 }
 # Score columns start at position 6; turn each one into standardized percentiles
 raw_scores <- d[, 6:ncol(d)]
 z_score <- as.data.frame(apply(raw_scores, 2, z_scorer))
 colnames(z_score) <- colnames(raw_scores)

 # Zero out variables which are bad
 # Bad optics, plus the score has little variation

 #z_score = sign(z_score)*pmin(abs(z_score),2.0) %>% as.data.frame

 #z_data=z_score[,1]
 ## target = d$Top
 # Fit a least-squares regression of target on a single variable, without an
 # intercept.
 #   z_data: numeric vector of standardized scores for one variable
 #   target: numeric vector of target values, same length as z_data
 # Returns an unnamed numeric vector: c(coefficient, R-squared).
 estimation <- function(z_data, target) {
   # Mismatched lengths would otherwise be recycled silently into a bogus fit
   if (length(z_data) != length(target)) {
     stop("z_data and target must have the same length", call. = FALSE)
   }
   dat <- as.data.frame(cbind(x = z_data, y = target))
   #r = glm(y~0+x,data=dat,family = binomial(link="logit"))
   fit <- lm(y ~ 0 + x, data = dat)
   # coef() avoids depending on `$` partial matching (fit$coef -> coefficients)
   unname(c(coef(fit), summary(fit)$r.squared))
 }

 # Estimation sample: only the rows flagged for the regression
 target_var <- d$Top[use_in_estimation == 1] * 2 - 1
 x_vars <- z_score[use_in_estimation == 1, ]

 # One regression per variable: row 1 holds the coefficient, row 2 the R-squared
 regression_params <- as.data.frame(
   apply(x_vars, 2, estimation, target = target_var)
 )
 betas <- unlist(regression_params[1, ])
 rsq <- unlist(regression_params[2, ])
 r <- rsq^(1/2)

 # Per-person, per-variable prediction: standardized score times its coefficient
 preds <- as.data.frame(
   t(apply(z_score, 1, function(person) person * betas))
 )

 # Pool every prediction (all people, all variables) into one common location
 # and scale; the pooled sd is multiplied by 1.5
 stage1_mean <- mean(unlist(preds))
 stage1_sd <- sd(unlist(preds)) * 1.5

 # Standardize each person's predictions with the pooled constants
 preds_normalized <- as.data.frame(
   t(apply(preds, 1, function(row_preds) (row_preds - stage1_mean) / stage1_sd))
 )

 # Coefficient and R-squared for each variable, one row per variable
 params_regression <- as.data.frame(cbind(betas = betas, rsq = rsq))
 rownames(params_regression) <- names(betas)


 # Overall raw score per person: average of that person's normalized predictions
 total <- rowSums(preds_normalized) / ncol(preds_normalized)
 total_mean <- mean(total)
 total_sd <- sd(total)

 # Scaling constants from both stages, kept together in a one-row data frame
 params_scaling <- as.data.frame(cbind(
   stage1_mean = stage1_mean,
   stage1_sd = stage1_sd,
   total_mean = total_mean,
   total_sd = total_sd
 ))

 # Standardize the totals, then map totals and per-variable values through the
 # normal CDF onto a 0-10 scale rounded to one decimal place
 total_normalized <- (total - total_mean) / total_sd
 total_score <- round(10 * pnorm(total_normalized), 1)
 preds_score <- round(10 * pnorm(as.matrix(preds_normalized)), 1)

 # Attach the scores to the descriptive columns and keep only rows whose Top
 # is 1 or -1
 output_scores <- filter(
   cbind(
     d[, c("Id", "Name", "Top", "Department", "WordCount")],
     Total = total_score,
     preds_score
   ),
   Top %in% c(1, -1)
 )

 # Save the scored rows as CSV at the output path taken from the command line
 write.csv(output_scores, file = outputFile)

 # alternative output format, not currently used.
 # ask Bill West for more details
 # write.csv(
 #   params_regression[sort.list(params_regression$rsq,decreasing=T),],file="output_params_regression.csv"
 # )
 # write.csv(
 #   params_scaling,file="output_params_scaling.csv"
 # )

diff --git a/input.csv b/input.csv
	# sentio model v3
	# use:
	# Sentio gives a client's best employees the test
	# Those employees are compared to previous sentio test takers
	# to determine which characteristics are most similar to
	# best employees.
	# output:
	# 1) output scores (0-10) for total and for each input variable
	# for the "best" employees and the employees which need to be scored
	# 2) parameters from regression
	# 3) scaling parameters
	# diagnostics:
	# provided in a non-production file (sentio_diagnostics.r)
	#

	# Data file requirements
	# d: file of example people and example target, see training_data_v2.csv
	# Wordcount all greater than 100
	# No NA/Missing scores.
	# Top:
	# 1, if one of the targets,
	# 0, if example data,
	# -1, if it should be excluded from the regression but scored on the resulting model

	# Estimation process
	#
	# Use target and example population Top %in% c(0,1)
	# 1. Target variable: 1 if in target, -1 if not in target
	#
	# 2. For each explanatory variable, calculate empirical distribution and label each observation
	# with its empirical percentile (will be between 0 and 1 inclusive)
	#
	# 3. Standardize explanatory variables by subtracting mean (0.5) and dividing by sd (always .2887ish for uniform [0,1])
	#
	# 4. regress target on each variable independently without an intercept
	#
	# 5. Generate predictions from each variable by multiplying the coefficient by the score for each individual
	#
	# 6. Find the mean and sd of all the scores (across all people and variables)
	# standardize: z = (x-mean)/sd
	# scale: pnorm(z)*10 to convert it to a score between 0 and 10
	# Doing this across all scores allows you to see easily that a 10 is really good and a 0 is really bad
	# the weighting of the variable is implied by the score
	#
	# 7 Do the same scaling (0-10)for the final score

	# Clear every object (hidden ones included) from the workspace before running
	rm(list = ls(all.names = TRUE))

	# magrittr supplies the %>% pipe used throughout the script
	library(magrittr)

	# Start-up working directory; nothing in this script reads it yet
	app_dir <- getwd()

	# Positional command-line arguments: 1 = input CSV path, 2 = output CSV path
	args <- commandArgs(trailingOnly = TRUE)
	inputFile <- args[1]
	outputFile <- args[2]

	# For some manipulation functions
	library(tidyr)
	library(dplyr)

	# Read in training data.
	# A missing command-line argument leaves inputFile/outputFile as NA, which
	# would otherwise only surface as an obscure connection error, so stop early
	# with a usage message.
	if (is.na(inputFile) || is.na(outputFile)) {
	  stop("usage: Rscript <script> <input.csv> <output.csv>", call. = FALSE)
	}
	d <- read.csv(inputFile)

	# The output step selects these descriptive columns by name, and the score
	# variables are taken by position from column 6 onwards; check both up front
	# so a malformed file is reported clearly instead of failing later.
	required_cols <- c("Id", "Name", "Top", "Department", "WordCount")
	missing_cols <- setdiff(required_cols, colnames(d))
	if (length(missing_cols) > 0) {
	  stop("input file is missing required column(s): ",
	       paste(missing_cols, collapse = ", "), call. = FALSE)
	}
	if (ncol(d) < 6) {
	  stop("input file has no score columns (expected from column 6 onwards)",
	       call. = FALSE)
	}

	# Rows with Top of 0 or 1 take part in the regression (flag 1); all others get 0
	use_in_estimation <- ifelse(d$Top %in% c(0, 1), 1, 0)
	# Regression target: Top 1 -> 1, Top 0 -> -1, rows outside the estimation -> 0
	target <- ifelse(use_in_estimation, 2 * d$Top - 1, 0)
	# Every column from the 6th onwards is treated as a score variable
	watson_vars <- colnames(d)[6:ncol(d)]

	# Per-variable mean and sd (sd floored at 0.20), combined into one table.
	# drop = FALSE keeps a data frame even when there is a single score column.
	var_mean <- apply(d[, watson_vars, drop = FALSE], 2, "mean")
	var_sd <- pmax(apply(d[, watson_vars, drop = FALSE], 2, "sd"), .20)
	var_stats <- cbind(Mean = var_mean, SD = var_sd)

	# Map each value of x to its empirical percentile (via ecdf) and standardize
	# it by centring on .5 and dividing by .2887
	z_scorer <- function(x) {
	  percentile <- ecdf(x)(x)
	  (percentile - .5) / .2887
	}

	# Standardize x using supplied statistics.
	# y must provide "Mean" and "SD" columns covering all of the variables;
	# it defaults to the var_stats table.
	f <- function(x, y = var_stats) {
	  centred <- x - y[, "Mean"]
	  centred / y[, "SD"]
	}
	# Score columns start at position 6; turn each one into standardized percentiles
	raw_scores <- d[, 6:ncol(d)]
	z_score <- as.data.frame(apply(raw_scores, 2, z_scorer))
	colnames(z_score) <- colnames(raw_scores)

	# Zero out variables which are bad
	# Bad optics, plus the score has little variation

	#z_score = sign(z_score)*pmin(abs(z_score),2.0) %>% as.data.frame

	#z_data=z_score[,1]
	## target = d$Top
	# Fit a least-squares regression of target on a single variable, without an
	# intercept.
	#   z_data: numeric vector of standardized scores for one variable
	#   target: numeric vector of target values, same length as z_data
	# Returns an unnamed numeric vector: c(coefficient, R-squared).
	estimation <- function(z_data, target) {
	  # Mismatched lengths would otherwise be recycled silently into a bogus fit
	  if (length(z_data) != length(target)) {
	    stop("z_data and target must have the same length", call. = FALSE)
	  }
	  dat <- as.data.frame(cbind(x = z_data, y = target))
	  #r = glm(y~0+x,data=dat,family = binomial(link="logit"))
	  fit <- lm(y ~ 0 + x, data = dat)
	  # coef() avoids depending on `$` partial matching (fit$coef -> coefficients)
	  unname(c(coef(fit), summary(fit)$r.squared))
	}

	# Estimation sample: only the rows flagged for the regression
	target_var <- d$Top[use_in_estimation == 1] * 2 - 1
	x_vars <- z_score[use_in_estimation == 1, ]

	# One regression per variable: row 1 holds the coefficient, row 2 the R-squared
	regression_params <- as.data.frame(
	  apply(x_vars, 2, estimation, target = target_var)
	)
	betas <- unlist(regression_params[1, ])
	rsq <- unlist(regression_params[2, ])
	r <- rsq^(1/2)

	# Per-person, per-variable prediction: standardized score times its coefficient
	preds <- as.data.frame(
	  t(apply(z_score, 1, function(person) person * betas))
	)

	# Pool every prediction (all people, all variables) into one common location
	# and scale; the pooled sd is multiplied by 1.5
	stage1_mean <- mean(unlist(preds))
	stage1_sd <- sd(unlist(preds)) * 1.5

	# Standardize each person's predictions with the pooled constants
	preds_normalized <- as.data.frame(
	  t(apply(preds, 1, function(row_preds) (row_preds - stage1_mean) / stage1_sd))
	)

	# Coefficient and R-squared for each variable, one row per variable
	params_regression <- as.data.frame(cbind(betas = betas, rsq = rsq))
	rownames(params_regression) <- names(betas)


	# Overall raw score per person: average of that person's normalized predictions
	total <- rowSums(preds_normalized) / ncol(preds_normalized)
	total_mean <- mean(total)
	total_sd <- sd(total)

	# Scaling constants from both stages, kept together in a one-row data frame
	params_scaling <- as.data.frame(cbind(
	  stage1_mean = stage1_mean,
	  stage1_sd = stage1_sd,
	  total_mean = total_mean,
	  total_sd = total_sd
	))

	# Standardize the totals, then map totals and per-variable values through the
	# normal CDF onto a 0-10 scale rounded to one decimal place
	total_normalized <- (total - total_mean) / total_sd
	total_score <- round(10 * pnorm(total_normalized), 1)
	preds_score <- round(10 * pnorm(as.matrix(preds_normalized)), 1)

	# Attach the scores to the descriptive columns and keep only rows whose Top
	# is 1 or -1
	output_scores <- filter(
	  cbind(
	    d[, c("Id", "Name", "Top", "Department", "WordCount")],
	    Total = total_score,
	    preds_score
	  ),
	  Top %in% c(1, -1)
	)

	# Save the scored rows as CSV at the output path taken from the command line
	write.csv(output_scores, file = outputFile)

	# alternative output format, not currently used.
	# ask Bill West for more details
	# write.csv(
	# params_regression[sort.list(params_regression$rsq,decreasing=T),],file="output_params_regression.csv"
	# )
	# write.csv(
	# params_scaling,file="output_params_scaling.csv"
	# )
	Id	Name	Top	Department	WordCount	openness	adventurousness	artistic_interests	emotionality	imagination	intellect	liberalism	conscientiousness	achievement_striving	cautiousness	dutifulness	orderliness	self_discipline	self_efficacy	extraversion	activity_level	assertiveness	cheerfulness	excitement_seeking	friendliness	gregariousness	agreeableness	altruism	cooperation	modesty	morality	sympathy	trust	neuroticism	anger	anxiety	depression	immoderation	self_consciousness	vulnerability	challenge	closeness	curiosity	excitement	harmony	ideal	liberty	love	practicality	self_expression	stability	structure	conservation	openness_to_change	hedonism	self_enhancement	self_transcendence
	7e1416bf-b1db-456e-aec2-3215a384825c	AJ Richichi	-1	Entry-Level Sales	828	0.98	0.668	0.346	0.094	0.434	0.97	0.72	0.875	0.981	0.904	0.421	0.332	0.873	0.971	0.985	0.989	0.988	0.357	0.162	0.427	0.234	0.213	0.565	0.462	0.038	0.538	0.572	0.815	0.963	0.286	0.102	0.434	0.134	0.211	0.057	0.512	0.039	0.574	0.113	0.06	0.269	0.233	0.095	0.497	0.184	0.225	0.761	0.037	0.431	0.019	0.275	0.149