Last active
April 17, 2017 09:03
-
-
Save MartinMacharia/3158b360858bf234b4d5715ca779fe25 to your computer and use it in GitHub Desktop.
Random Forest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# soils
# Load the soil data from the single .xlsx workbook in the project folder.

# Machine-specific project folder: change this path to the directory on your
# computer that holds the xlsx file.
setwd("c:/Users/machariam/Desktop/Soil_R_project")

# load libraries
# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing error.
pkgs <- c(
  "data.table", "DescTools", "stringr", "ggplot2", "readxl",
  "ranger", "RWeka", "Boruta", "DMwR"
)
invisible(lapply(pkgs, library, character.only = TRUE))

# load data
# List the Excel workbooks in the working directory. The pattern is anchored
# to the file extension, and the "~$" lock files that Excel creates while a
# workbook is open are dropped.
myfiles <- dir(pattern = "\\.xlsx$", ignore.case = TRUE)
myfiles <- myfiles[!startsWith(myfiles, "~$")]
myfiles

# read_excel() takes exactly one path, so fail early with a clear message
# when the folder holds no workbook or more than one.
if (length(myfiles) != 1L) {
  stop(
    "Expected exactly one .xlsx file in ", getwd(), " but found ",
    length(myfiles), ".",
    call. = FALSE
  )
}

# Empty cells are read as NA.
df <- read_excel(myfiles, na = "")
df

# Convert to a data.table by reference: the object is modified in place, so
# no copy of the data is made (unlike as.data.table()).
setDT(df)
str(df)
names(df)
# select some variables
# Columns kept for the analysis; GrainYield is the response used in the
# model formulas.
model_vars <- c(
  "Nrate", "Prate", "Krate", "Pppm", "ExchK", "ExchCa", "ExchMg", "ExchNa",
  "ExchAl", "CEC", "ECEC", "Sand", "Silt", "Clay", "GrainYield"
)

# Report every column that is absent from the workbook in one message instead
# of stopping at the first unknown name.
missing_vars <- setdiff(model_vars, names(df))
if (length(missing_vars) > 0) {
  stop(
    "Columns not found in the data: ", paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}

# with = FALSE selects columns by name and returns a new data.table.
m <- df[, model_vars, with = FALSE]
m
str(m)
# plot missing values and impute
library(VIM)

# mice is loaded for further imputation experiments; nothing in this section
# calls it, so a missing installation is reported but does not stop the run.
if (!require(mice)) {
  message("Package 'mice' is not installed; continuing without it.")
}

# Percentage of missing (NA) entries in a vector.
#
# m: vector to inspect.
# Returns sum(is.na(m)) / length(m) * 100 (NaN for a zero-length vector).
pMiss <- function(m) {
  sum(is.na(m)) / length(m) * 100
}

# Inspect the missingness BEFORE imputing; run after the imputation these
# diagnostics would only ever show a complete table.
PlotMiss(m, clust = FALSE)    # DescTools: location of the missing cells
vapply(m, pMiss, numeric(1))  # % missing per column
apply(m, 1, pMiss)            # % missing per row
aggr_plot <- aggr(
  m,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(m),
  cex.axis = .7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

# Impute and keep the result in m so the modelling steps receive complete
# data. DMwR::knnImputation() can stop with "Not sufficient complete cases
# for computing neighbors"; in that case fall back to VIM::kNN().
# kNN is qualified with VIM:: so it cannot be confused with a kNN() exported
# by another attached package, and imp_var = FALSE keeps the column set
# unchanged (no extra *_imp indicator columns).
m <- tryCatch(
  knnImputation(m),
  error = function(e) {
    message(
      "knnImputation() failed (", conditionMessage(e),
      "); falling back to VIM::kNN()."
    )
    VIM::kNN(m, k = 3, imp_var = FALSE)
  }
)

# View() needs an interactive session with a data viewer.
if (interactive()) {
  View(m)
}
# which variables are important?
# Boruta feature selection with GrainYield as the response; doTrace = 2
# reports progress while it runs.
Boruta(GrainYield ~ ., data = m, doTrace = 2)

# model
# Random forest on all selected predictors. The response must be spelled
# exactly like the column (GrainYield): R names are case-sensitive, so
# `grainyield` is not found in m.
r <- ranger(
  GrainYield ~ .,
  data = m,
  importance = "impurity",
  write.forest = TRUE
)
PlotDot(r$variable.importance)

# Predictions for the same rows the forest was trained on.
pred <- predict(r, data = m)

# plot
p <- data.table(yield = m$GrainYield, pred.yield = pred$predictions)

# In-sample correlation between observed and predicted yield; it is computed
# on the training rows, so it is an optimistic measure of fit.
cor(m$GrainYield, pred$predictions)

# Add the variety column from the raw data and label missing entries.
# setDT() makes sure m is a data.table before := is used on it.
setDT(m)
m[, var := df$variety1]
m[is.na(var), var := "unknown"]

# what are the frequency distributions of the variables?
# (three guesses: *not* normal ;-))
Desc(m, plot = TRUE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment