Last active
August 29, 2015 14:03
-
-
Save not-for-me/d3b9539e8ea81aa434a1 to your computer and use it in GitHub Desktop.
Clustering text (Bible) with R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Library Load
library(tm)

# Set file Paths: one directory per text collection
otFilePath <- "~/Documents/mining/project/old"
ntFilePath <- "~/Documents/mining/project/new"

# Import txt to TextCorpus
# Both directories are read with the same settings: plain-text reader,
# documents tagged as English.
readerOpts <- list(reader = readPlain, language = "en")
oldTextCorpus <- Corpus(DirSource(otFilePath), readerControl = readerOpts)
newTextCorpus <- Corpus(DirSource(ntFilePath), readerControl = readerOpts)

# Overview of the two freshly loaded corpora
summary(oldTextCorpus)
summary(newTextCorpus)
# Text Preprocessing
library(SnowballC)  # loaded ahead of the stemDocument() step below

# Extra words removed in addition to tm's built-in English stop-word list
myStopWords <- c(
  "also", "among", "like", "may", "must", "shall", "take", "went", "will"
)

# Run the cleaning pipeline on one corpus and return the cleaned corpus.
# Steps, in order: lower-case, drop English stop words, drop the custom stop
# words, drop digits, drop punctuation, collapse whitespace, stem.
# Lower-casing comes first so that stop-word matching sees lower-case text.
preprocessCorpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeWords, myStopWords)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}

oldTextCorpus <- preprocessCorpus(oldTextCorpus)
newTextCorpus <- preprocessCorpus(newTextCorpus)

# Old and new collections concatenated into a single corpus
bibleCorpus <- c(oldTextCorpus, newTextCorpus)
# Document-term matrices for the old, new and combined corpora;
# dim() reports each matrix's dimensions.
old_dtm <- DocumentTermMatrix(oldTextCorpus)
dim(old_dtm)
new_dtm <- DocumentTermMatrix(newTextCorpus)
dim(new_dtm)
bible_dtm <- DocumentTermMatrix(bibleCorpus)
dim(bible_dtm)

# Remove Sparse Terms
# Sparsity threshold handed to removeSparseTerms(): terms whose share of
# empty (zero-count) documents reaches this level are dropped.
max_sparsity <- 0.8
old_stm <- removeSparseTerms(old_dtm, max_sparsity)
dim(old_stm)
new_stm <- removeSparseTerms(new_dtm, max_sparsity)
dim(new_stm)
bible_stm <- removeSparseTerms(bible_dtm, max_sparsity)
dim(bible_stm)

# Transposed copies of the reduced matrices (terms in rows)
old_tdm <- t(old_stm)
dim(old_tdm)
new_tdm <- t(new_stm)
dim(new_tdm)
bible_tdm <- t(bible_stm)
dim(bible_tdm)
# Convert stm to dataframe
# Use as.matrix() to materialise the full dense matrix. inspect() is a display
# helper: in tm >= 0.7 it prints and returns only a small sample of the
# matrix, so as.data.frame(inspect(x)) would silently truncate the data that
# is clustered below. as.matrix() also avoids dumping the matrix to the
# console, and gives the same full matrix on older tm releases.
old_df <- as.data.frame(as.matrix(old_stm))
new_df <- as.data.frame(as.matrix(new_stm))
bible_df <- as.data.frame(as.matrix(bible_stm))
| ## hierarchical clustering | |
| library(proxy) | |
| library(ggplot2) | |
## cosine Distance / single, complete, average, median, centroid,
## ward.D, ward.D2 and mcquitty methods
# "cosine" is not a stats::dist() method; this relies on dist() from the
# proxy package attached above.
# The distance matrices depend only on the metric, so each is computed once
# and reused for every linkage method.
old_d <- dist(old_df, method = "cosine")
new_d <- dist(new_df, method = "cosine")
bible_d <- dist(bible_df, method = "cosine")

# NOTE: per ?hclust, "median" and "centroid" linkage can produce inversions
# in the dendrogram; read those plots with care.
for (linkage in c("single", "complete", "average", "median", "centroid",
                  "ward.D", "ward.D2", "mcquitty")) {
  # Titles name the data set, metric and linkage so the plots can be told
  # apart.
  old_hc <- hclust(old_d, method = linkage)
  plot(old_hc, main = paste("old: cosine /", linkage))
  new_hc <- hclust(new_d, method = linkage)
  plot(new_hc, main = paste("new: cosine /", linkage))
  bible_hc <- hclust(bible_d, method = linkage)
  plot(bible_hc, main = paste("bible: cosine /", linkage))
}
## euclidean Distance / single, complete, average, median, centroid,
## ward.D, ward.D2 and mcquitty methods
# The distance matrices depend only on the metric, so each is computed once
# and reused for every linkage method.
old_d <- dist(old_df, method = "euclidean")
new_d <- dist(new_df, method = "euclidean")
bible_d <- dist(bible_df, method = "euclidean")

# NOTE: per ?hclust, "median" and "centroid" linkage can produce inversions
# in the dendrogram; read those plots with care.
for (linkage in c("single", "complete", "average", "median", "centroid",
                  "ward.D", "ward.D2", "mcquitty")) {
  # Titles name the data set, metric and linkage so the plots can be told
  # apart.
  old_hc <- hclust(old_d, method = linkage)
  plot(old_hc, main = paste("old: euclidean /", linkage))
  new_hc <- hclust(new_d, method = linkage)
  plot(new_hc, main = paste("new: euclidean /", linkage))
  bible_hc <- hclust(bible_d, method = linkage)
  plot(bible_hc, main = paste("bible: euclidean /", linkage))
}
## manhattan Distance / single, complete, average, median, centroid,
## ward.D, ward.D2 and mcquitty methods
# The distance matrices depend only on the metric, so each is computed once
# and reused for every linkage method.
old_d <- dist(old_df, method = "manhattan")
new_d <- dist(new_df, method = "manhattan")
bible_d <- dist(bible_df, method = "manhattan")

# NOTE: per ?hclust, "median" and "centroid" linkage can produce inversions
# in the dendrogram; read those plots with care.
for (linkage in c("single", "complete", "average", "median", "centroid",
                  "ward.D", "ward.D2", "mcquitty")) {
  # Titles name the data set, metric and linkage so the plots can be told
  # apart.
  old_hc <- hclust(old_d, method = linkage)
  plot(old_hc, main = paste("old: manhattan /", linkage))
  new_hc <- hclust(new_d, method = linkage)
  plot(new_hc, main = paste("new: manhattan /", linkage))
  bible_hc <- hclust(bible_d, method = linkage)
  plot(bible_hc, main = paste("bible: manhattan /", linkage))
}
## canberra Distance / ward.D2 and mcquitty methods
# The distance matrices depend only on the metric, so each is computed once
# and reused for both linkage methods.
old_d <- dist(old_df, method = "canberra")
new_d <- dist(new_df, method = "canberra")
bible_d <- dist(bible_df, method = "canberra")

for (linkage in c("ward.D2", "mcquitty")) {
  # Titles name the data set, metric and linkage so the plots can be told
  # apart.
  old_hc <- hclust(old_d, method = linkage)
  plot(old_hc, main = paste("old: canberra /", linkage))
  new_hc <- hclust(new_d, method = linkage)
  plot(new_hc, main = paste("new: canberra /", linkage))
  bible_hc <- hclust(bible_d, method = linkage)
  plot(bible_hc, main = paste("bible: canberra /", linkage))
}
## binary Distance / ward.D2 and mcquitty methods
# The distance matrices depend only on the metric, so each is computed once
# and reused for both linkage methods.
old_d <- dist(old_df, method = "binary")
new_d <- dist(new_df, method = "binary")
bible_d <- dist(bible_df, method = "binary")

for (linkage in c("ward.D2", "mcquitty")) {
  # Titles name the data set, metric and linkage so the plots can be told
  # apart.
  old_hc <- hclust(old_d, method = linkage)
  plot(old_hc, main = paste("old: binary /", linkage))
  new_hc <- hclust(new_d, method = linkage)
  plot(new_hc, main = paste("new: binary /", linkage))
  bible_hc <- hclust(bible_d, method = linkage)
  plot(bible_hc, main = paste("bible: binary /", linkage))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment