Top 10 algorithms in data mining -- using R

Top 10 algorithms in data mining - with R

Wu et al. describe the top 10 algorithms in data mining in (LDO) "Top 10 algorithms in data mining" (2007). This document shows how to use these algorithms in R. The datasets used are available in R itself, so there is no need to download anything; run data() to see the available datasets. Nothing here is original: everything was Googled, and no references are made to sources. The purpose is to show how quickly you can prototype most of these algorithms in R with minimal code.

1. C4.5

# Fit a C4.5 decision tree to iris and print its textual form.
# library() (not require()) so that a missing package stops the script here.
library(rJava) # needed for printing strings out of Java objects
library(RWeka) # contains the J48() function that builds C4.5 decision trees
iris_c4.5 <- J48(Species ~ ., data = iris)
# R names are case-sensitive: read back the same object that was assigned above.
writeLines(rJava::.jstrVal(iris_c4.5$classifier))

2. k-means

# Cluster the four numeric iris measurements into 3 groups with k-means.
# kmeans() starts from randomly chosen centers, so fix the seed and try
# several starts to get a reproducible, well-converged solution.
set.seed(123)
iris_features <- subset(iris, select = -Species)
iris_km <- kmeans(iris_features, centers = 3, nstart = 25)
# Cross-tabulate the true species against the assigned cluster.
table(iris$Species, iris_km$cluster)
# Colour observations by cluster (cluster id + 1 gives palette colours 2..4).
plot(iris[c("Sepal.Length", "Sepal.Width")],
     col = iris_km$cluster + 1, pch = 19)
# Overlay the cluster centers using the same colours as their members.
points(iris_km$centers[, c("Sepal.Length", "Sepal.Width")],
       col = 2:4, pch = 17, cex = 3)

3. Support Vector Machines

# Train a radial-kernel support vector classifier on iris and plot it.
library(e1071)
# The kind of SVM is chosen with `type`; an unknown argument such as
# `method` is not an svm() parameter and would be passed along via `...`.
iris_svm <- svm(Species ~ ., data = iris, type = "C-classification",
                kernel = "radial", cost = 10, gamma = 0.1)
summary(iris_svm)
# The model has four predictors; plot the two petal dimensions and hold the
# two sepal dimensions fixed at the values given in `slice`.
plot(iris_svm, iris, Petal.Width ~ Petal.Length,
     slice = list(Sepal.Width = 3, Sepal.Length = 4))

4. The Apriori algorithm

# Mine association rules from the Adult transactions data set.
library(arules)
data("Adult")
# Parameter names are spelled out in full instead of relying on partial
# matching of `supp` / `conf`.
rules <- apriori(
  Adult,
  parameter = list(support = 0.5, confidence = 0.9, target = "rules")
)
# Show the first few mined rules.
inspect(head(rules))

5. The EM algorithm

# Fit a two-component normal mixture to the Old Faithful waiting times by EM.
library(mixtools)
data("faithful")
# Starting values: equal mixing weights, component means near 55 and 80
# minutes, common starting standard deviation of 5.
wait1 <- normalmixEM(faithful$waiting, lambda = 0.5, mu = c(55, 80), sigma = 5)
# Draw only the density plot (plot 2). FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
plot(wait1, density = TRUE, cex.axis = 1.4, cex.lab = 1.4, cex.main = 1.8,
     whichplots = 2, main2 = "Time between Old Faithful eruptions",
     xlab2 = "Minutes", ask = FALSE)

6. PageRank

# Compute PageRank scores for the vertices of a random directed graph.
library(igraph)
# We cheat here a bit by starting directly with a graph instead
# of building one from some data like a set of web pages.
set.seed(123) # the graph is random; fix the seed so the output is repeatable
# sample_gnp() / page_rank() replace the deprecated random.graph.game() /
# page.rank() names.
g <- sample_gnp(20, 5 / 20, directed = TRUE) # gets us a directed graph
plot(g)
page_rank(g)$vector

7. AdaBoost

# Boost classification trees on a random 100-row training split of iris and
# report the confusion matrix on the remaining rows.
library(rpart)
library(adabag)
set.seed(123) # makes the train/test split reproducible
# seq_len() instead of 1:nrow(): safe even if the data frame had zero rows.
train_ind <- sample(seq_len(nrow(iris)), 100)
iris_train <- iris[train_ind, ]
iris_test <- iris[-train_ind, ]
iris_adaboost <- boosting(Species ~ ., data = iris_train, boos = TRUE,
                          mfinal = 10)
# Predict on the held-out rows and show predicted vs. observed classes.
iris_pred <- predict(iris_adaboost, newdata = iris_test)
iris_pred$confusion

8. k-nearest neighbor

# Classify every iris observation by its 3 nearest neighbours.
library(class)
# The same feature columns serve as training and test set, so the table below
# reports resubstitution (training-set) accuracy, not held-out accuracy.
iris_features <- subset(iris, select = -Species)
iris_knn <- knn(train = iris_features,
                test = iris_features,
                cl = iris$Species, k = 3, prob = TRUE)
# Actual species in rows, predicted species in columns.
table(Actual = iris$Species, Predicted = iris_knn)

9. Naive Bayes

# Fit a naive Bayes classifier to iris and tabulate its predictions.
library(e1071)
# Predictors are every column except the class label.
iris_features <- subset(iris, select = -Species)
iris_nb <- naiveBayes(iris_features, iris$Species)
# Predictions are made on the training data itself; rows are predicted
# classes, columns are actual classes.
table(predict(iris_nb, iris_features),
      iris$Species, dnn = list("predicted", "actual"))

10. CART

# Grow a CART classification tree on iris and draw it.
library(rpart)
iris_cart <- rpart(Species ~ ., data = iris)
# Leave extra white space around the tree so the split labels added by
# text() are not cut off at the plot border.
plot(iris_cart, margin = 0.1)
text(iris_cart)

Dmitrii-I/top_10_data_mining_algos_using_R.md