|
library(data.table) |
|
library(taxstats) |
|
library(dplyr) |
|
|
|
|
|
## How are the sample file columns related to each other? |
|
|
|
# http://stackoverflow.com/questions/13088770/how-to-write-linearly-dependent-column-in-a-matrix-in-terms-of-linearly-independ |
|
linfinder <- function(mat){ |
|
# If the matrix is full rank then we're done |
|
if(qr(mat)$rank == ncol(mat)){ |
|
print("Matrix is of full rank") |
|
return(invisible(seq(ncol(mat)))) |
|
} |
|
m <- ncol(mat) |
|
# cols keeps track of which columns are linearly independent |
|
cols <- 1 |
|
for(i in seq(2, m)){ |
|
ids <- c(cols, i) |
|
mymat <- mat[, ids] |
|
if(qr(mymat)$rank != length(ids)){ |
|
# Regression the column of interest on the previous |
|
# columns to figure out the relationship |
|
o <- lm(mat[,i] ~ mat[,cols] + 0) |
|
# Construct the output message |
|
start <- names(numeric.sample.file)[i] |
|
# Which coefs are nonzero |
|
nz <- !(abs(coef(o)) <= .Machine$double.eps^0.5) |
|
tmp <- paste("Column", cols[nz], sep = "_") |
|
tmp <- names(numeric.sample.file)[nz] |
|
coef.o <- round(coef(o), 2) |
|
vals <- paste(coef.o[nz], tmp, sep = "*", collapse = " + ") |
|
message <- paste0(start, " = ", vals) |
|
print(message) |
|
} else { |
|
# If the matrix subset was of full rank |
|
# then the newest column in linearly independent |
|
# so add it to the cols list |
|
cols <- ids |
|
} |
|
} |
|
return(invisible(cols)) |
|
} |
|
|
|
numeric.sample.file <- |
|
sample_file_1213[ , which(sapply(sample_file_1213, is.numeric)), with = FALSE] %>% |
|
filter(complete.cases(.)) |
|
linfinder(data.matrix(numeric.sample.file)) |
|
|
|
# [1] "Rent_cap_wks_amt = -1*Net_rent_amt + 1*Gross_rent_amt + -1*Other_rent_ded_amt + -1*Rent_int_ded_amt + -1*Other_inc_amt + 1*Tot_inc_amt + -1*WRE_car_amt + -1*WRE_trvl_amt + -1*Spouse_adjusted_taxable_inc + 1*Net_fincl_invstmt_lss_amt + -1*Rptbl_Empr_spr_cont_amt + -1*Cr_PAYG_ITI_amt" |
|
# [1] "Tot_inc_amt = 1*Sw_amt + 1*Alow_ben_amt + 1*ETP_txbl_amt + 1*Grs_int_amt + 1*Aust_govt_pnsn_allw_amt + 1*Unfranked_Div_amt + 1*Frk_Div_amt + 1*Dividends_franking_cr_amt + 1*Net_rent_amt + 1*Rent_cap_wks_amt + 1*Net_farm_management_amt + 1*Net_PP_BI_amt + 1*Total_NPP_BE_amt + 1*Tot_CY_CG_amt + 1*Net_PT_PP_dsn + 1*Net_PT_NPP_dsn + 1*Taxed_othr_pnsn_amt + 1*Untaxed_othr_pnsn_amt + 1*Other_foreign_inc_amt + 1*Div_Ded_amt + 1*Intrst_Ded_amt + 1*Gift_amt + 1*Non_emp_spr_amt + 1*Cost_tax_affairs_amt + 1*Other_Ded_amt + 1*Tot_ded_amt + 1*PP_loss_claimed + 1*NPP_loss_claimed + 1*Spouse_adjusted_taxable_inc + 1*Net_fincl_invstmt_lss_amt + 1*Rptbl_Empr_spr_cont_amt + 1*Taxable_Income" |
|
# [1] "Tot_ded_amt = 1*Other_inc_amt + 1*Tot_inc_amt + 1*WRE_car_amt + 1*WRE_trvl_amt + 1*WRE_uniform_amt + 1*WRE_self_amt + 1*WRE_other_amt + 1*Div_Ded_amt + 1*Intrst_Ded_amt + 1*Gift_amt + 1*Non_emp_spr_amt" |
|
|
|
|
|
library(randomForest) |
|
library(dplyr) |
|
library(magrittr) |
|
library(parallel) |
|
library(foreach) |
|
|
|
numCores <- detectCores() |
|
cluster <- makePSOCKcluster(cores) |
|
|
|
|
|
# which columns have any NAs (and so are unsuitable for RF) |
|
nNA.cols <- names(sample_file_1213)[sapply(sample_file_1213, function(x){!any(is.na(x))})] |
|
|
|
num_trees = 100L |
|
trees_per_core = floor(num_trees / numCores) |
|
|
|
train <- |
|
sample_file_1213 %>% |
|
sample_n(20000) %>% |
|
select_(.dots = nNA.cols) %>% |
|
mutate(Correct = factor(Taxable_Income == Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>% |
|
select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt)) |
|
|
|
|
|
|
|
RF <- |
|
foreach(trees=rep(trees_per_core, numCores), .combine = randomForest::combine, .multicombine = TRUE) %dopar% { |
|
randomForest(Correct ~ ., data = train, importance = TRUE, ntree = trees) |
|
} |
|
|
|
sample_file_1213 %>% |
|
sample_n(50000) %>% |
|
select_(.dots = nNA.cols) %>% |
|
mutate(Correct = factor(Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>% |
|
select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt)) %>% |
|
mutate(predicted = predict(RF, newdata = .)) %>% |
|
select(Correct, predicted) %>% |
|
count(Correct, predicted) |
|
|
|
|
|
importance(RF) %>% |
|
broom::tidy() %>% |
|
arrange(desc(MeanDecreaseAccuracy)) %>% |
|
mutate(x = factor(.rownames, levels = .$.rownames)) %>% |
|
ggplot(aes(x = x, y = MeanDecreaseAccuracy)) + |
|
geom_bar(stat = "identity", width = 0.9) + |
|
coord_flip() |
|
|
|
sample_file_1213 %>% |
|
select_(.dots = names(.)[sapply(., is.numeric)]) %>% |
|
group_by(Correct = (Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>% |
|
summarise_each(funs(mean)) %>% |
|
melt(id.vars = "Correct") %>% |
|
group_by(variable) %>% |
|
summarise(ratio = first(value) / last(value)) %>% |
|
ungroup %>% |
|
arrange(ratio) %>% |
|
as.data.table |