Skip to content

Instantly share code, notes, and snippets.

@josefslerka
Created December 1, 2012 19:39
Show Gist options
  • Save josefslerka/4184428 to your computer and use it in GitHub Desktop.
Save josefslerka/4184428 to your computer and use it in GitHub Desktop.
Normalized Compression Distance
# Normalized Compression Distance
# http://en.wikipedia.org/wiki/Normalized_Compression_Distance
# Quick and dirty implementation
# Compute the Normalized Compression Distance (NCD) between two inputs.
#
#   NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
#
# where C(.) is the length in bytes of the compressed input and xy is x
# immediately followed by y (no separator).
#
# Args:
#   file1, file2: Objects to compare. Each one is flattened to a single
#     string with toString() before it is compressed.
#   type: Compression algorithm handed to memCompress(). Defaults to
#     "bzip2", the algorithm this function used before the argument existed.
#
# Returns:
#   A single numeric value.
ncd <- function(file1, file2, type = c("bzip2", "gzip", "xz")) {
  type <- match.arg(type)
  text1 <- toString(file1)
  text2 <- toString(file2)

  # Size in bytes of a string once compressed with the selected algorithm.
  compressed_size <- function(text) {
    length(memCompress(text, type = type))
  }

  size1 <- compressed_size(text1)
  size2 <- compressed_size(text2)
  size_both <- compressed_size(paste0(text1, text2))

  (size_both - min(size1, size2)) / max(size1, size2)
}
library(tm)
# Build a tm corpus from the "romany" directory.
# NOTE(review): "cz" is a country code; the ISO 639-1 language code for
# Czech is "cs". Left unchanged here -- confirm before altering.
romany <- Corpus(DirSource("romany"), readerControl = list(language = "cz"))

# Pairwise NCD matrix over the first 10 documents (fewer if the corpus is
# smaller, instead of failing with a subscript error).
n_docs <- min(10L, length(romany))
doc_index <- seq_len(n_docs)

# Preallocate a numeric matrix. Column i / row x holds
# ncd(romany[[i]], romany[[x]]), the same layout as filling a matrix
# column-wise from the values in loop order. Keeping the values numeric
# avoids a round trip through character strings before dist().
a <- matrix(NA_real_, nrow = n_docs, ncol = n_docs)
for (i in doc_index) {
  for (x in doc_index) {
    a[x, i] <- ncd(romany[[i]], romany[[x]])
  }
}

# Label rows and columns with the document identifiers, in corpus order.
# NOTE(review): ID() comes from older tm releases; newer releases appear to
# expose the identifier via meta(doc, "id") -- confirm against the installed
# tm version.
names.vectors <- vapply(
  doc_index,
  function(k) as.character(ID(romany[[k]])),
  character(1)
)
dimnames(a) <- list(names.vectors, names.vectors)

# Hierarchical clustering of the documents, each one represented by its row
# of NCD values.
d <- dist(a, method = "euclidean")
# "ward" was renamed to "ward.D" in R 3.1.0; this is the same algorithm
# without the rename message.
fit <- hclust(d, method = "ward.D")
plot(fit) # display dendrogram
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment