MartinMacharia · April 17, 2017 09:03
diff --git a/Random Forest b/Random Forest
 # soils
 # Set this to the folder on your computer that contains the xlsx file.
 setwd("c:/Users/machariam/Desktop/Soil_R_project") # Change working directory

 # Load libraries. library() stops with an error naming the missing package;
 # the previous lapply(..., require) only returned FALSE and the script then
 # failed later with "could not find function".
 pkgs <- c(
   "data.table", "DescTools", "stringr", "ggplot2", "readxl", "ranger",
   "RWeka", "Boruta", "DMwR"
 )
 for (pkg in pkgs) {
   library(pkg, character.only = TRUE)
 }

 # Load data. `pattern` is a regular expression, so anchor it to the file
 # extension; a bare "xlsx" matches any name that merely contains those letters.
 myfiles <- dir(pattern = "\\.xlsx$", ignore.case = TRUE)
 # Skip the "~$..." lock files Excel creates while a workbook is open.
 myfiles <- myfiles[!startsWith(myfiles, "~$")]
 myfiles
 # read_excel() takes a single path: stop unless exactly one workbook matched.
 if (length(myfiles) != 1L) {
   stop(
     "Expected exactly one .xlsx file in ", getwd(), " but found ",
     length(myfiles), ": ", paste(myfiles, collapse = ", "),
     call. = FALSE
   )
 }
 df <- read_excel(myfiles, na = "") # empty cells are read as NA
 df
 # Convert to a data.table by reference (modified in place, no copy is made).
 setDT(df)
 str(df)
 names(df)
 # Select the columns used for modelling; GrainYield is the response in the
 # model formulas further down.
 m <- df[, .(
   Nrate, Prate, Krate, Pppm, ExchK, ExchCa, ExchMg, ExchNa, ExchAl,
   CEC, ECEC, Sand, Silt, Clay, GrainYield
 )]
 m
 str(m)

 # Plot missing values (DescTools::PlotMiss shows where the NAs are located).
 PlotMiss(m, clust = FALSE)

 # Impute. DMwR::knnImputation(m) stopped here with "Not sufficient complete
 # cases for computing neighbors", and the VIM::kNN() result was only printed,
 # never stored, so `m` was never actually imputed. Use kNN() and keep its
 # result.
 library("VIM")
 # as.data.frame(): hand kNN a plain data frame; imp_var = FALSE: do not append
 # the logical "<column>_imp" indicator columns to the result.
 m <- kNN(as.data.frame(m), k = 3, imp_var = FALSE)
 setDT(m) # later steps use data.table syntax on m
 if (interactive()) {
   View(m) # the data viewer is only useful in an interactive session
 }

 # More on imputation
 # Try MICE package
 # library() instead of require(): fail loudly if mice is not installed.
 library(mice)
 #' Percentage of missing values.
 #'
 #' @param m A vector; the script calls this on each column and each row of
 #'   the data.
 #' @return A numeric scalar: the number of NA elements divided by
 #'   `length(m)`, times 100 (NaN for zero-length input).
 pMiss <- function(m) {
   n_missing <- sum(is.na(m))
   n_total <- length(m)
   n_missing / n_total * 100
 }
 # Missing-data diagnostics. These are computed on the raw columns taken from
 # `df`: by this point `m` has been through the imputation step, and
 # summarising imputed data would report no missing values.
 m_raw <- df[, intersect(names(m), names(df)), with = FALSE]
 # Percent missing per column; vapply() walks the columns directly instead of
 # apply() coercing the whole table to a matrix first.
 vapply(m_raw, pMiss, numeric(1))
 # Percent missing per row.
 rowMeans(is.na(m_raw)) * 100
 library(VIM)
 # Missingness per variable and the combinations in which values are missing.
 aggr_plot <- aggr(
   m_raw,
   col = c("navyblue", "red"), numbers = TRUE, sortVars = TRUE,
   labels = names(m_raw), cex.axis = 0.7, gap = 3,
   ylab = c("Histogram of missing data", "Pattern")
 )


 # which variables are important?
 Boruta(GrainYield ~ ., data = m, doTrace = 2)
 # model
 # The response column is `GrainYield`; column names are case-sensitive and
 # `grainyield` does not exist in `m`, so the formula must use the exact name.
 r <- ranger(
   GrainYield ~ .,
   data = m, importance = "impurity", write.forest = TRUE
 )
 PlotDot(r$variable.importance)
 pred <- predict(r, m)

 # Observed vs. predicted yield.
 # NOTE(review): predict(r, m) scores the same rows the forest was fitted on,
 # so this correlation is likely optimistic; ranger keeps out-of-bag
 # predictions in r$predictions if an honest estimate is wanted.
 p <- data.table(yield = m$GrainYield, pred.yield = pred$predictions)
 cor(p$yield, p$pred.yield)

 # Add the `variety1` column from the raw data as `var`, labelling missing
 # entries "unknown".
 if (!"variety1" %in% names(df)) {
   # df$variety1 would be NULL here, and `var := NULL` does not add a column.
   stop("Column 'variety1' not found in df", call. = FALSE)
 }
 # Convert to character first so the "unknown" label can always be stored,
 # whatever type the column was read as.
 variety <- as.character(df[["variety1"]])
 variety[is.na(variety)] <- "unknown"
 setDT(m) # `:=` needs a data.table; imputation may have returned a data.frame
 m[, var := variety]

 # what are the frequency distributions of the variables?
 # (three guesses: *not* normal ;-))
 Desc(m, plot = TRUE)
	# soils
	# Set this to the folder on your computer that contains the xlsx file.
	setwd("c:/Users/machariam/Desktop/Soil_R_project") # Change working directory

	# Load libraries. library() stops with an error naming the missing package;
	# the previous lapply(..., require) only returned FALSE and the script then
	# failed later with "could not find function".
	pkgs <- c(
	  "data.table", "DescTools", "stringr", "ggplot2", "readxl", "ranger",
	  "RWeka", "Boruta", "DMwR"
	)
	for (pkg in pkgs) {
	  library(pkg, character.only = TRUE)
	}

	# Load data. `pattern` is a regular expression, so anchor it to the file
	# extension; a bare "xlsx" matches any name that merely contains those letters.
	myfiles <- dir(pattern = "\\.xlsx$", ignore.case = TRUE)
	# Skip the "~$..." lock files Excel creates while a workbook is open.
	myfiles <- myfiles[!startsWith(myfiles, "~$")]
	myfiles
	# read_excel() takes a single path: stop unless exactly one workbook matched.
	if (length(myfiles) != 1L) {
	  stop(
	    "Expected exactly one .xlsx file in ", getwd(), " but found ",
	    length(myfiles), ": ", paste(myfiles, collapse = ", "),
	    call. = FALSE
	  )
	}
	df <- read_excel(myfiles, na = "") # empty cells are read as NA
	df
	# Convert to a data.table by reference (modified in place, no copy is made).
	setDT(df)
	str(df)
	names(df)
	# Select the columns used for modelling; GrainYield is the response in the
	# model formulas further down.
	m <- df[, .(
	  Nrate, Prate, Krate, Pppm, ExchK, ExchCa, ExchMg, ExchNa, ExchAl,
	  CEC, ECEC, Sand, Silt, Clay, GrainYield
	)]
	m
	str(m)

	# Plot missing values (DescTools::PlotMiss shows where the NAs are located).
	PlotMiss(m, clust = FALSE)

	# Impute. DMwR::knnImputation(m) stopped here with "Not sufficient complete
	# cases for computing neighbors", and the VIM::kNN() result was only printed,
	# never stored, so `m` was never actually imputed. Use kNN() and keep its
	# result.
	library("VIM")
	# as.data.frame(): hand kNN a plain data frame; imp_var = FALSE: do not append
	# the logical "<column>_imp" indicator columns to the result.
	m <- kNN(as.data.frame(m), k = 3, imp_var = FALSE)
	setDT(m) # later steps use data.table syntax on m
	if (interactive()) {
	  View(m) # the data viewer is only useful in an interactive session
	}

	# More on imputation
	# Try MICE package
	# library() instead of require(): fail loudly if mice is not installed.
	library(mice)
	#' Percentage of missing values.
	#'
	#' @param m A vector; the script calls this on each column and each row of
	#'   the data.
	#' @return A numeric scalar: the number of NA elements divided by
	#'   `length(m)`, times 100 (NaN for zero-length input).
	pMiss <- function(m) {
	  n_missing <- sum(is.na(m))
	  n_total <- length(m)
	  n_missing / n_total * 100
	}
	# Missing-data diagnostics. These are computed on the raw columns taken from
	# `df`: by this point `m` has been through the imputation step, and
	# summarising imputed data would report no missing values.
	m_raw <- df[, intersect(names(m), names(df)), with = FALSE]
	# Percent missing per column; vapply() walks the columns directly instead of
	# apply() coercing the whole table to a matrix first.
	vapply(m_raw, pMiss, numeric(1))
	# Percent missing per row.
	rowMeans(is.na(m_raw)) * 100
	library(VIM)
	# Missingness per variable and the combinations in which values are missing.
	aggr_plot <- aggr(
	  m_raw,
	  col = c("navyblue", "red"), numbers = TRUE, sortVars = TRUE,
	  labels = names(m_raw), cex.axis = 0.7, gap = 3,
	  ylab = c("Histogram of missing data", "Pattern")
	)


	# which variables are important?
	Boruta(GrainYield ~ ., data = m, doTrace = 2)
	# model
	# The response column is `GrainYield`; column names are case-sensitive and
	# `grainyield` does not exist in `m`, so the formula must use the exact name.
	r <- ranger(
	  GrainYield ~ .,
	  data = m, importance = "impurity", write.forest = TRUE
	)
	PlotDot(r$variable.importance)
	pred <- predict(r, m)

	# Observed vs. predicted yield.
	# NOTE(review): predict(r, m) scores the same rows the forest was fitted on,
	# so this correlation is likely optimistic; ranger keeps out-of-bag
	# predictions in r$predictions if an honest estimate is wanted.
	p <- data.table(yield = m$GrainYield, pred.yield = pred$predictions)
	cor(p$yield, p$pred.yield)

	# Add the `variety1` column from the raw data as `var`, labelling missing
	# entries "unknown".
	if (!"variety1" %in% names(df)) {
	  # df$variety1 would be NULL here, and `var := NULL` does not add a column.
	  stop("Column 'variety1' not found in df", call. = FALSE)
	}
	# Convert to character first so the "unknown" label can always be stored,
	# whatever type the column was read as.
	variety <- as.character(df[["variety1"]])
	variety[is.na(variety)] <- "unknown"
	setDT(m) # `:=` needs a data.table; imputation may have returned a data.frame
	m[, var := variety]

	# what are the frequency distributions of the variables?
	# (three guesses: not normal ;-))
	Desc(m, plot = TRUE)