Created
May 22, 2012 11:42
-
-
Save arademaker/2768545 to your computer and use it in GitHub Desktop.
Script Gustavo L. A. chapter 3 of Collective Intelligence using R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the words of every blog and keep only the moderately common ones.
#
# `corpus` and `getWords` are defined elsewhere in the script. Each element
# returned by getWords() is presumably a named vector of word counts for one
# blog (names = words) -- TODO confirm against getWords().
corpus2 <- lapply(corpus, getWords)

# Drop the entries for which getWords() returned nothing. The mask is kept in
# `tira.erro` so that other per-blog vectors can be filtered the same way.
tira.erro <- vapply(corpus2, length, integer(1)) > 0
corpus2 <- corpus2[tira.erro]

# Tally how often each word appears among the element names of the remaining
# entries (one hit per blog, assuming names are unique within a blog).
apcount <- table(unlist(lapply(corpus2, names)))
apcount.df <- as.data.frame(apcount, stringsAsFactors = FALSE)

# f = tally divided by the number of remaining blogs.
apcount.df$f <- apcount.df$Freq / length(corpus2)

# Keep only the words with f strictly between 0.1 and 0.5.
apcount.df <- subset(apcount.df, f > 0.1 & f < 0.5)
# Building the matrix: one row per blog, one column per retained word.
#
# The blog name is the second component of each `tmp.sample2` entry
# (`tmp.sample2` is defined elsewhere; the type of that component is not
# visible here). The names are filtered with the same mask as `corpus2`.
blogs.names <- sapply(tmp.sample2, function(x) x[[2]])[tira.erro]

# Rows are filled from corpus2, so both vectors must describe the same blogs.
stopifnot(length(blogs.names) == length(corpus2))

# NA_real_ keeps the matrix numeric even when no cell is ever assigned.
out.file <- matrix(NA_real_,
                   nrow = length(corpus2),
                   ncol = nrow(apcount.df),
                   dimnames = list(blogs.names, apcount.df[[1]]))

for (i in seq_along(corpus2)) {
  # Column position of each word of blog i; NA for words that were filtered
  # out of apcount.df.
  matching <- match(names(corpus2[[i]]), colnames(out.file))
  found <- !is.na(matching)
  # Store each value divided by the number of elements in the blog's entry.
  out.file[i, matching[found]] <- corpus2[[i]][found] / length(corpus2[[i]])
}

# Words that never occur in a blog get a value of 0.
out.file[is.na(out.file)] <- 0
# correlacao <- cor(t(out.file), use = "pairwise.complete.obs")
# require(graphics); require(utils)

# Hierarchical clustering of the rows of out.file (the blogs), using
# average linkage on Euclidean distances between rows.
hc <- hclust(dist(out.file, method = "euclidean"), method = "average")
# Euclidean method
# http://www.stat.ucl.ac.be/ISdidactique/Rhelp/library/amap/html/dist.html
# A review of cluster analysis in health psychology research found
# that the most common distance measure in published studies in that
# research area is the Euclidean distance or the squared Euclidean
# distance. http://en.wikipedia.org/wiki/Hierarchical_clustering

# Convert the clustering to a dendrogram and draw it.
dend <- as.dendrogram(hc)
plot(dend)
# The two lines above correspond to the content of pp. 33-40. His method
# is more mechanical, but in any case at some point we will have to
# diverge from him, at the latest when he uses the "drawnode" function.
# Splitting the words into clusters:
# tmp <- dist(t(out.file))
# hc2 <- hclust(tmp, "ave")
# dend2 <- as.dendrogram(hc2)

# Number of groups, shared by the k-means run and the rectangles drawn on
# the dendrogram so that the two groupings stay comparable.
n.groups <- 3

# Because kmeans() uses random centroids to start with, the order of the
# results returned will almost always be different.
tmp <- kmeans(out.file, centers = n.groups)

# Copy the hierarchical clustering and append each blog's k-means cluster
# number to its label. Both vectors follow the row order of out.file.
tmp2 <- hc
tmp2$labels <- paste(hc$labels, tmp$cluster)
dend <- as.dendrogram(tmp2)
plot(dend)

# Draw red rectangles around the groups obtained by cutting the tree into
# n.groups clusters (3 in this case).
rect.hclust(hc, k = n.groups, border = "red")
# Note that the hclust groups (inside the red rectangles) were not the
# same as the kmeans ones (identified by 1, 2 or 3 after the blog
# names).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment