Instantly share code, notes, and snippets.
Last active
April 22, 2024 00:05
-
Star
(1)
1
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save HughParsonage/4a3ad646560e0b6218ad to your computer and use it in GitHub Desktop.
How are the columns of the ATO's 2% sample file related?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(taxstats) | |
library(dplyr) | |
## How are the sample file columns related to each other? | |
# http://stackoverflow.com/questions/13088770/how-to-write-linearly-dependent-column-in-a-matrix-in-terms-of-linearly-independ | |
linfinder <- function(mat){ | |
# If the matrix is full rank then we're done | |
if(qr(mat)$rank == ncol(mat)){ | |
print("Matrix is of full rank") | |
return(invisible(seq(ncol(mat)))) | |
} | |
m <- ncol(mat) | |
# cols keeps track of which columns are linearly independent | |
cols <- 1 | |
for(i in seq(2, m)){ | |
ids <- c(cols, i) | |
mymat <- mat[, ids] | |
if(qr(mymat)$rank != length(ids)){ | |
# Regression the column of interest on the previous | |
# columns to figure out the relationship | |
o <- lm(mat[,i] ~ mat[,cols] + 0) | |
# Construct the output message | |
start <- names(numeric.sample.file)[i] | |
# Which coefs are nonzero | |
nz <- !(abs(coef(o)) <= .Machine$double.eps^0.5) | |
tmp <- paste("Column", cols[nz], sep = "_") | |
tmp <- names(numeric.sample.file)[nz] | |
coef.o <- round(coef(o), 2) | |
vals <- paste(coef.o[nz], tmp, sep = "*", collapse = " + ") | |
message <- paste0(start, " = ", vals) | |
print(message) | |
} else { | |
# If the matrix subset was of full rank | |
# then the newest column in linearly independent | |
# so add it to the cols list | |
cols <- ids | |
} | |
} | |
return(invisible(cols)) | |
} | |
numeric.sample.file <- | |
sample_file_1213[ , which(sapply(sample_file_1213, is.numeric)), with = FALSE] %>% | |
filter(complete.cases(.)) | |
linfinder(data.matrix(numeric.sample.file)) | |
# [1] "Rent_cap_wks_amt = -1*Net_rent_amt + 1*Gross_rent_amt + -1*Other_rent_ded_amt + -1*Rent_int_ded_amt + -1*Other_inc_amt + 1*Tot_inc_amt + -1*WRE_car_amt + -1*WRE_trvl_amt + -1*Spouse_adjusted_taxable_inc + 1*Net_fincl_invstmt_lss_amt + -1*Rptbl_Empr_spr_cont_amt + -1*Cr_PAYG_ITI_amt" | |
# [1] "Tot_inc_amt = 1*Sw_amt + 1*Alow_ben_amt + 1*ETP_txbl_amt + 1*Grs_int_amt + 1*Aust_govt_pnsn_allw_amt + 1*Unfranked_Div_amt + 1*Frk_Div_amt + 1*Dividends_franking_cr_amt + 1*Net_rent_amt + 1*Rent_cap_wks_amt + 1*Net_farm_management_amt + 1*Net_PP_BI_amt + 1*Total_NPP_BE_amt + 1*Tot_CY_CG_amt + 1*Net_PT_PP_dsn + 1*Net_PT_NPP_dsn + 1*Taxed_othr_pnsn_amt + 1*Untaxed_othr_pnsn_amt + 1*Other_foreign_inc_amt + 1*Div_Ded_amt + 1*Intrst_Ded_amt + 1*Gift_amt + 1*Non_emp_spr_amt + 1*Cost_tax_affairs_amt + 1*Other_Ded_amt + 1*Tot_ded_amt + 1*PP_loss_claimed + 1*NPP_loss_claimed + 1*Spouse_adjusted_taxable_inc + 1*Net_fincl_invstmt_lss_amt + 1*Rptbl_Empr_spr_cont_amt + 1*Taxable_Income" | |
# [1] "Tot_ded_amt = 1*Other_inc_amt + 1*Tot_inc_amt + 1*WRE_car_amt + 1*WRE_trvl_amt + 1*WRE_uniform_amt + 1*WRE_self_amt + 1*WRE_other_amt + 1*Div_Ded_amt + 1*Intrst_Ded_amt + 1*Gift_amt + 1*Non_emp_spr_amt" | |
library(randomForest) | |
library(dplyr) | |
library(magrittr) | |
library(parallel) | |
library(foreach) | |
numCores <- detectCores() | |
cluster <- makePSOCKcluster(cores) | |
# which columns have any NAs (and so are unsuitable for RF) | |
nNA.cols <- names(sample_file_1213)[sapply(sample_file_1213, function(x){!any(is.na(x))})] | |
num_trees = 100L | |
trees_per_core = floor(num_trees / numCores) | |
train <- | |
sample_file_1213 %>% | |
sample_n(20000) %>% | |
select_(.dots = nNA.cols) %>% | |
mutate(Correct = factor(Taxable_Income == Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>% | |
select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt)) | |
RF <- | |
foreach(trees=rep(trees_per_core, numCores), .combine = randomForest::combine, .multicombine = TRUE) %dopar% { | |
randomForest(Correct ~ ., data = train, importance = TRUE, ntree = trees) | |
} | |
sample_file_1213 %>% | |
sample_n(50000) %>% | |
select_(.dots = nNA.cols) %>% | |
mutate(Correct = factor(Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>% | |
select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt)) %>% | |
mutate(predicted = predict(RF, newdata = .)) %>% | |
select(Correct, predicted) %>% | |
count(Correct, predicted) | |
importance(RF) %>% | |
broom::tidy() %>% | |
arrange(desc(MeanDecreaseAccuracy)) %>% | |
mutate(x = factor(.rownames, levels = .$.rownames)) %>% | |
ggplot(aes(x = x, y = MeanDecreaseAccuracy)) + | |
geom_bar(stat = "identity", width = 0.9) + | |
coord_flip() | |
sample_file_1213 %>% | |
select_(.dots = names(.)[sapply(., is.numeric)]) %>% | |
group_by(Correct = (Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>% | |
summarise_each(funs(mean)) %>% | |
melt(id.vars = "Correct") %>% | |
group_by(variable) %>% | |
summarise(ratio = first(value) / last(value)) %>% | |
ungroup %>% | |
arrange(ratio) %>% | |
as.data.table |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment