HughParsonage · April 22, 2024 00:05
diff --git a/how_ATO_sample_cols_are_related.R b/how_ATO_sample_cols_are_related.R
 library(data.table)
 library(taxstats)
 library(dplyr)


 ## How are the sample file columns related to each other?

 # http://stackoverflow.com/questions/13088770/how-to-write-linearly-dependent-column-in-a-matrix-in-terms-of-linearly-independ
 linfinder <- function(mat){
  # If the matrix is full rank then we're done
  if(qr(mat)$rank == ncol(mat)){
    print("Matrix is of full rank")
    return(invisible(seq(ncol(mat))))
  }
  m <- ncol(mat)
  # cols keeps track of which columns are linearly independent
  cols <- 1
  for(i in seq(2, m)){
    ids <- c(cols, i)
    mymat <- mat[, ids]
    if(qr(mymat)$rank != length(ids)){
      # Regression the column of interest on the previous
      # columns to figure out the relationship
      o <- lm(mat[,i] ~ mat[,cols] + 0)
      # Construct the output message
      start <- names(numeric.sample.file)[i]
      # Which coefs are nonzero
      nz <- !(abs(coef(o)) <= .Machine$double.eps^0.5)
      tmp <- paste("Column", cols[nz], sep = "_")
      tmp <- names(numeric.sample.file)[nz]
      coef.o <- round(coef(o), 2)
      vals <- paste(coef.o[nz], tmp, sep = "*", collapse = " + ")
      message <- paste0(start, " = ", vals)
      print(message)
    } else {
      # If the matrix subset was of full rank
      # then the newest column in linearly independent
      # so add it to the cols list
      cols <- ids
    }
  }
  return(invisible(cols))
 }

 numeric.sample.file <- 
  sample_file_1213[ , which(sapply(sample_file_1213, is.numeric)), with = FALSE] %>%
  filter(complete.cases(.))
 linfinder(data.matrix(numeric.sample.file))

 # [1] "Rent_cap_wks_amt = -1*Net_rent_amt + 1*Gross_rent_amt + -1*Other_rent_ded_amt + -1*Rent_int_ded_amt + -1*Other_inc_amt + 1*Tot_inc_amt + -1*WRE_car_amt + -1*WRE_trvl_amt + -1*Spouse_adjusted_taxable_inc + 1*Net_fincl_invstmt_lss_amt + -1*Rptbl_Empr_spr_cont_amt + -1*Cr_PAYG_ITI_amt"
 # [1] "Tot_inc_amt = 1*Sw_amt + 1*Alow_ben_amt + 1*ETP_txbl_amt + 1*Grs_int_amt + 1*Aust_govt_pnsn_allw_amt + 1*Unfranked_Div_amt + 1*Frk_Div_amt + 1*Dividends_franking_cr_amt + 1*Net_rent_amt + 1*Rent_cap_wks_amt + 1*Net_farm_management_amt + 1*Net_PP_BI_amt + 1*Total_NPP_BE_amt + 1*Tot_CY_CG_amt + 1*Net_PT_PP_dsn + 1*Net_PT_NPP_dsn + 1*Taxed_othr_pnsn_amt + 1*Untaxed_othr_pnsn_amt + 1*Other_foreign_inc_amt + 1*Div_Ded_amt + 1*Intrst_Ded_amt + 1*Gift_amt + 1*Non_emp_spr_amt + 1*Cost_tax_affairs_amt + 1*Other_Ded_amt + 1*Tot_ded_amt + 1*PP_loss_claimed + 1*NPP_loss_claimed + 1*Spouse_adjusted_taxable_inc + 1*Net_fincl_invstmt_lss_amt + 1*Rptbl_Empr_spr_cont_amt + 1*Taxable_Income"
 # [1] "Tot_ded_amt = 1*Other_inc_amt + 1*Tot_inc_amt + 1*WRE_car_amt + 1*WRE_trvl_amt + 1*WRE_uniform_amt + 1*WRE_self_amt + 1*WRE_other_amt + 1*Div_Ded_amt + 1*Intrst_Ded_amt + 1*Gift_amt + 1*Non_emp_spr_amt"


 library(randomForest)
 library(dplyr)
 library(magrittr)
 library(parallel)
 library(foreach)

 numCores <- detectCores()
 cluster <- makePSOCKcluster(cores)


 # which columns have any NAs (and so are unsuitable for RF)
 nNA.cols <- names(sample_file_1213)[sapply(sample_file_1213, function(x){!any(is.na(x))})]

 num_trees = 100L
 trees_per_core = floor(num_trees / numCores)

 train <-  
  sample_file_1213 %>%
  sample_n(20000) %>%
  select_(.dots = nNA.cols) %>%
  mutate(Correct = factor(Taxable_Income == Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>%
  select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt)) 



 RF <- 
  foreach(trees=rep(trees_per_core, numCores), .combine = randomForest::combine, .multicombine = TRUE) %dopar% {
    randomForest(Correct ~ ., data = train, importance = TRUE, ntree = trees)
  }

 sample_file_1213 %>%
  sample_n(50000) %>%
  select_(.dots = nNA.cols) %>%
  mutate(Correct = factor(Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>%
  select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt))  %>%
  mutate(predicted = predict(RF, newdata = .)) %>% 
  select(Correct, predicted) %>%
  count(Correct, predicted)


 importance(RF) %>%
  broom::tidy() %>%
  arrange(desc(MeanDecreaseAccuracy)) %>%
  mutate(x = factor(.rownames, levels = .$.rownames)) %>%
  ggplot(aes(x = x, y = MeanDecreaseAccuracy)) + 
  geom_bar(stat = "identity", width = 0.9) + 
  coord_flip()

 sample_file_1213 %>% 
  select_(.dots = names(.)[sapply(., is.numeric)]) %>% 
  group_by(Correct = (Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>% 
  summarise_each(funs(mean)) %>% 
  melt(id.vars = "Correct") %>% 
  group_by(variable) %>% 
  summarise(ratio = first(value) / last(value)) %>% 
  ungroup %>% 
  arrange(ratio) %>% 
  as.data.table
	library(data.table)
	library(taxstats)
	library(dplyr)


	## How are the sample file columns related to each other?

	# http://stackoverflow.com/questions/13088770/how-to-write-linearly-dependent-column-in-a-matrix-in-terms-of-linearly-independ
	linfinder <- function(mat){
	# If the matrix is full rank then we're done
	if(qr(mat)$rank == ncol(mat)){
	print("Matrix is of full rank")
	return(invisible(seq(ncol(mat))))
	}
	m <- ncol(mat)
	# cols keeps track of which columns are linearly independent
	cols <- 1
	for(i in seq(2, m)){
	ids <- c(cols, i)
	mymat <- mat[, ids]
	if(qr(mymat)$rank != length(ids)){
	# Regression the column of interest on the previous
	# columns to figure out the relationship
	o <- lm(mat[,i] ~ mat[,cols] + 0)
	# Construct the output message
	start <- names(numeric.sample.file)[i]
	# Which coefs are nonzero
	nz <- !(abs(coef(o)) <= .Machine$double.eps^0.5)
	tmp <- paste("Column", cols[nz], sep = "_")
	tmp <- names(numeric.sample.file)[nz]
	coef.o <- round(coef(o), 2)
	vals <- paste(coef.o[nz], tmp, sep = "*", collapse = " + ")
	message <- paste0(start, " = ", vals)
	print(message)
	} else {
	# If the matrix subset was of full rank
	# then the newest column in linearly independent
	# so add it to the cols list
	cols <- ids
	}
	}
	return(invisible(cols))
	}

	numeric.sample.file <-
	sample_file_1213[ , which(sapply(sample_file_1213, is.numeric)), with = FALSE] %>%
	filter(complete.cases(.))
	linfinder(data.matrix(numeric.sample.file))

	# [1] "Rent_cap_wks_amt = -1Net_rent_amt + 1Gross_rent_amt + -1Other_rent_ded_amt + -1Rent_int_ded_amt + -1Other_inc_amt + 1Tot_inc_amt + -1WRE_car_amt + -1WRE_trvl_amt + -1Spouse_adjusted_taxable_inc + 1Net_fincl_invstmt_lss_amt + -1Rptbl_Empr_spr_cont_amt + -1Cr_PAYG_ITI_amt"
	# [1] "Tot_inc_amt = 1Sw_amt + 1Alow_ben_amt + 1ETP_txbl_amt + 1Grs_int_amt + 1Aust_govt_pnsn_allw_amt + 1Unfranked_Div_amt + 1Frk_Div_amt + 1Dividends_franking_cr_amt + 1Net_rent_amt + 1Rent_cap_wks_amt + 1Net_farm_management_amt + 1Net_PP_BI_amt + 1Total_NPP_BE_amt + 1Tot_CY_CG_amt + 1Net_PT_PP_dsn + 1Net_PT_NPP_dsn + 1Taxed_othr_pnsn_amt + 1Untaxed_othr_pnsn_amt + 1Other_foreign_inc_amt + 1Div_Ded_amt + 1Intrst_Ded_amt + 1Gift_amt + 1Non_emp_spr_amt + 1Cost_tax_affairs_amt + 1Other_Ded_amt + 1Tot_ded_amt + 1PP_loss_claimed + 1NPP_loss_claimed + 1Spouse_adjusted_taxable_inc + 1Net_fincl_invstmt_lss_amt + 1Rptbl_Empr_spr_cont_amt + 1Taxable_Income"
	# [1] "Tot_ded_amt = 1Other_inc_amt + 1Tot_inc_amt + 1WRE_car_amt + 1WRE_trvl_amt + 1WRE_uniform_amt + 1WRE_self_amt + 1WRE_other_amt + 1Div_Ded_amt + 1Intrst_Ded_amt + 1Gift_amt + 1*Non_emp_spr_amt"


	library(randomForest)
	library(dplyr)
	library(magrittr)
	library(parallel)
	library(foreach)

	numCores <- detectCores()
	cluster <- makePSOCKcluster(cores)


	# which columns have any NAs (and so are unsuitable for RF)
	nNA.cols <- names(sample_file_1213)[sapply(sample_file_1213, function(x){!any(is.na(x))})]

	num_trees = 100L
	trees_per_core = floor(num_trees / numCores)

	train <-
	sample_file_1213 %>%
	sample_n(20000) %>%
	select_(.dots = nNA.cols) %>%
	mutate(Correct = factor(Taxable_Income == Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>%
	select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt))



	RF <-
	foreach(trees=rep(trees_per_core, numCores), .combine = randomForest::combine, .multicombine = TRUE) %dopar% {
	randomForest(Correct ~ ., data = train, importance = TRUE, ntree = trees)
	}

	sample_file_1213 %>%
	sample_n(50000) %>%
	select_(.dots = nNA.cols) %>%
	mutate(Correct = factor(Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>%
	select(-c(Ind, Taxable_Income, Tot_inc_amt, Tot_ded_amt)) %>%
	mutate(predicted = predict(RF, newdata = .)) %>%
	select(Correct, predicted) %>%
	count(Correct, predicted)


	importance(RF) %>%
	broom::tidy() %>%
	arrange(desc(MeanDecreaseAccuracy)) %>%
	mutate(x = factor(.rownames, levels = .$.rownames)) %>%
	ggplot(aes(x = x, y = MeanDecreaseAccuracy)) +
	geom_bar(stat = "identity", width = 0.9) +
	coord_flip()

	sample_file_1213 %>%
	select_(.dots = names(.)[sapply(., is.numeric)]) %>%
	group_by(Correct = (Taxable_Income != Tot_inc_amt - Tot_ded_amt - PP_loss_claimed - NPP_loss_claimed)) %>%
	summarise_each(funs(mean)) %>%
	melt(id.vars = "Correct") %>%
	group_by(variable) %>%
	summarise(ratio = first(value) / last(value)) %>%
	ungroup %>%
	arrange(ratio) %>%
	as.data.table