Wu et al. describe the top 10 algorithms in data mining in (LDO) "Top 10 algorithms in data mining" (2007). This file shows how to use each of these algorithms in R. The datasets used ship with R or with the packages loaded below, so there is nothing to download; run data() to list the available datasets.
Nothing here is original: everything was found by searching the web, and the sources are not cited. The purpose is to show how quickly most of these algorithms can be prototyped in R with minimal code.
# C4.5 decision tree, built with the J48() implementation from RWeka.
# library() is used instead of require() so that a missing package fails
# here, loudly, rather than later with an unrelated "could not find function".
library(rJava) # needed for printing strings out of Java objects
library(RWeka) # contains the J48() function that builds C4.5 decision trees

# Fit a tree that predicts Species from all other columns of iris.
iris_c4.5 <- J48(Species ~ ., data = iris)

# Print the fitted tree as text, taken from the string value of the
# underlying Java classifier object. R is case-sensitive, so the object
# must be referenced with exactly the name it was assigned to above.
writeLines(rJava::.jstrVal(iris_c4.5$classifier))
# k-means: partition the iris measurements (every column except Species)
# into 3 clusters. The initial centres are chosen at random, so the
# cluster numbering can differ from run to run.
iris_km <- kmeans(iris[, names(iris) != "Species"], centers = 3)

# Cross-tabulate the true species (rows) against the cluster ids (columns).
table(iris$Species, iris_km$cluster)

# Scatter plot of the two sepal measurements, coloured by cluster
# (colours 2..4), with each cluster centre overlaid as a large triangle
# drawn in the colour of its own cluster.
plot(iris[c("Sepal.Length", "Sepal.Width")],
     col = 1 + iris_km$cluster, pch = 19)
points(iris_km$centers[, c("Sepal.Length", "Sepal.Width")],
       col = 1 + seq_len(3), pch = 17, cex = 3)
# Support vector machine (via e1071).
library(e1071)

# Fit a C-classification SVM with a radial kernel, predicting Species from
# all other columns of iris. The kind of SVM is selected with `type`;
# svm() has no `method` argument, and unknown arguments are swallowed by
# `...` without any warning.
iris_svm <- svm(Species ~ ., data = iris, type = "C-classification",
                kernel = "radial", cost = 10, gamma = 0.1)
summary(iris_svm)

# Plot the classification regions in the Petal.Length / Petal.Width plane.
# The model uses four predictors, so the two that are not on the axes are
# held constant at the values given in `slice`.
plot(iris_svm, iris, Petal.Width ~ Petal.Length,
     slice = list(Sepal.Width = 3, Sepal.Length = 4))
# Apriori association-rule mining (via arules).
library(arules)
data("Adult") # load the Adult data set

# Mine association rules with a minimum support of 0.5 and a minimum
# confidence of 0.9. Parameter names are written out in full rather than
# relying on abbreviation matching.
rules <- apriori(
  Adult,
  parameter = list(support = 0.5, confidence = 0.9, target = "rules")
)

# Show the first few rules that were found.
inspect(head(rules))
# EM algorithm: fit a normal mixture to the Old Faithful waiting times
# (via mixtools).
library(mixtools)
data("faithful")

# Two starting means (55 and 80) are supplied, so two components are fitted;
# lambda and sigma give the starting mixing proportion and standard deviation.
wait1 <- normalmixEM(faithful$waiting, lambda = 0.5, mu = c(55, 80), sigma = 5)

# whichplots = 2 selects the density plot; the "...2" arguments label it.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
plot(wait1, density = TRUE, cex.axis = 1.4, cex.lab = 1.4, cex.main = 1.8,
     whichplots = 2,
     main2 = "Time between Old Faithful eruptions", xlab2 = "Minutes",
     ask = FALSE)
# PageRank (via igraph).
library(igraph)

# We cheat here a bit by starting directly with a graph instead
# of building one from some data like a set of web pages.
# sample_gnp() and page_rank() are the current igraph names for the
# deprecated random.graph.game() and page.rank().
# No seed is set, so a different random graph is drawn on every run.
g <- sample_gnp(20, 5 / 20, directed = TRUE) # gets us a directed graph
plot(g)

# PageRank score of every vertex of g.
page_rank(g)$vector
# AdaBoost (via adabag, which boosts rpart trees).
library(rpart)
library(adabag)

# Fix the RNG seed so the train/test split below is reproducible.
set.seed(123)

# Hold out 50 of the 150 iris rows: 100 random rows train the model and the
# remaining rows test it. seq_len() is the safe form of 1:nrow().
train_ind <- sample(seq_len(nrow(iris)), 100)
iris_train <- iris[train_ind, ]
iris_test <- iris[-train_ind, ]

# Boosted classifier for Species, using mfinal = 10 boosting iterations.
iris_adaboost <- boosting(Species ~ ., data = iris_train,
                          boos = TRUE, mfinal = 10)

# Confusion matrix of the predictions on the held-out rows.
predict(iris_adaboost, newdata = iris_test)$confusion
# k-nearest neighbours (via class).
library(class)

# Classify every iris row from its 3 nearest neighbours, using all columns
# except Species as features.
# NOTE: the training and test sets are the same rows here, so the table
# below shows resubstitution accuracy, not accuracy on unseen data.
# prob = TRUE attaches a "prob" attribute to the returned factor.
iris_knn <- knn(
  train = subset(iris, select = -Species),
  test = subset(iris, select = -Species),
  cl = iris$Species,
  k = 3,
  prob = TRUE
)

# Confusion matrix: true species in rows, predicted species in columns.
table(Actual = iris$Species, Predicted = iris_knn)
# Naive Bayes (via e1071).
library(e1071)

# Fit the classifier with all columns except Species as predictors and
# Species as the class label.
iris_nb <- naiveBayes(subset(iris, select = -Species), iris$Species)

# Confusion matrix of the predictions on the same rows the model was fitted
# to: predicted class in rows, actual class in columns. `dnn` is given as a
# character vector, which is the type table() documents for it.
table(predict(iris_nb, subset(iris, select = -Species)),
      iris$Species, dnn = c("predicted", "actual"))
# CART classification tree (via rpart).
library(rpart)

# Fit a tree that predicts Species from all other columns of iris.
iris_cart <- rpart(Species ~ ., data = iris)

# Draw the tree skeleton, then add the split and leaf labels on top of it.
plot(iris_cart)
text(iris_cart)