thomasjensen · January 26, 2012 14:40
diff --git a/textmining.r b/textmining.r
 ##read in the libraries and set the working directory
 library(tm)
 library(corrplot)
 ##placeholder path: point this at the folder holding indvandringPolitikken.csv
 setwd("/path/to/")

 ##read in the data and subset it to the relevant categories
 data <- read.csv("indvandringPolitikken.csv", fileEncoding = "latin1")
 ##%in% never yields NA, so rows with a missing kategori are dropped
 ##instead of coming back as all-NA rows the way chained == / | would
 keep_categories <- c("Politik", "Debat", "Kronikken", "Leder")
 data <- data[data$kategori %in% keep_categories, ]

 ##create the corpus and clean it
 ##read.csv(fileEncoding = "latin1") has already re-encoded the text, and
 ##current tm releases' VectorSource() takes only the vector itself, so no
 ##encoding argument is passed here
 ##lower-case with base R before building the corpus: passing a plain base
 ##function such as tolower to tm_map() is not handled consistently across
 ##tm versions
 articles <- tolower(as.character(data$artikel))
 corpus <- Corpus(VectorSource(articles))
 corpus <- tm_map(corpus, removePunctuation)
 corpus <- tm_map(corpus, removeNumbers)
 corpus <- tm_map(corpus, removeWords, stopwords("danish"))

 ##build the document-term matrix from the cleaned corpus, then prune it
 ##with removeSparseTerms() at a sparsity threshold of 0.99
 dtm <- removeSparseTerms(DocumentTermMatrix(corpus), sparse = 0.99)

 ##get the top ten words
 top10 <- as.matrix(dtm)

 ##total count of each term across all documents, most frequent first
 v <- sort(colSums(top10), decreasing = TRUE)
 ##head() copes with fewer than ten surviving terms (v[1:10] would pad with NA);
 ##re-sort ascending so the most frequent term is drawn as the top bar
 v1 <- sort(head(v, 10))

 ##one grey shade per bar actually plotted
 barplot(v1, horiz = TRUE, cex.names = 0.7, las = 1, col = grey.colors(length(v1)), main = "Frequency of Terms")

 ##get the names of the 10 words that correlate the highest with "indvandring"
 term <- "indvandring"
 assoc <- findAssocs(dtm, term, 0.2)
 ##NOTE(review): newer tm releases return a list keyed by term, older ones a
 ##named vector that includes the term itself - handle both, confirm against
 ##the installed tm version
 if (is.list(assoc)) assoc <- assoc[[term]]
 ##drop the query term by name, then keep at most the ten strongest associations
 assoc <- assoc[names(assoc) != term]
 words <- names(head(assoc, 10))
 oi <- as.matrix(dtm)
 find <- colnames(oi) %in% words
 ##drop = FALSE keeps a matrix even when only one term matches
 corr <- cor(oi[, find, drop = FALSE])
 corrplot(corr)

 ##get the names of the 10 words that correlate the highest with "islam"
 term <- "islam"
 assoc <- findAssocs(dtm, term, 0.2)
 ##NOTE(review): newer tm releases return a list keyed by term, older ones a
 ##named vector that includes the term itself - handle both, confirm against
 ##the installed tm version
 if (is.list(assoc)) assoc <- assoc[[term]]
 ##drop the query term by name, then keep at most the ten strongest associations
 assoc <- assoc[names(assoc) != term]
 words <- names(head(assoc, 10))
 oi <- as.matrix(dtm)
 find <- colnames(oi) %in% words
 ##drop = FALSE keeps a matrix even when only one term matches
 corr <- cor(oi[, find, drop = FALSE])
 corrplot(corr)
	##read in the libraries and set the working directory
	library(tm)
	library(corrplot)
	##placeholder path: point this at the folder holding indvandringPolitikken.csv
	setwd("/path/to/")

	##read in the data and subset it to the relevant categories
	data <- read.csv("indvandringPolitikken.csv", fileEncoding = "latin1")
	##%in% never yields NA, so rows with a missing kategori are dropped
	##instead of coming back as all-NA rows the way chained == / | would
	keep_categories <- c("Politik", "Debat", "Kronikken", "Leder")
	data <- data[data$kategori %in% keep_categories, ]

	##create the corpus and clean it
	##read.csv(fileEncoding = "latin1") has already re-encoded the text, and
	##current tm releases' VectorSource() takes only the vector itself, so no
	##encoding argument is passed here
	##lower-case with base R before building the corpus: passing a plain base
	##function such as tolower to tm_map() is not handled consistently across
	##tm versions
	articles <- tolower(as.character(data$artikel))
	corpus <- Corpus(VectorSource(articles))
	corpus <- tm_map(corpus, removePunctuation)
	corpus <- tm_map(corpus, removeNumbers)
	corpus <- tm_map(corpus, removeWords, stopwords("danish"))

	##build the document-term matrix from the cleaned corpus, then prune it
	##with removeSparseTerms() at a sparsity threshold of 0.99
	dtm <- removeSparseTerms(DocumentTermMatrix(corpus), sparse = 0.99)

	##get the top ten words
	top10 <- as.matrix(dtm)

	##total count of each term across all documents, most frequent first
	v <- sort(colSums(top10), decreasing = TRUE)
	##head() copes with fewer than ten surviving terms (v[1:10] would pad with NA);
	##re-sort ascending so the most frequent term is drawn as the top bar
	v1 <- sort(head(v, 10))

	##one grey shade per bar actually plotted
	barplot(v1, horiz = TRUE, cex.names = 0.7, las = 1, col = grey.colors(length(v1)), main = "Frequency of Terms")

	##get the names of the 10 words that correlate the highest with "indvandring"
	term <- "indvandring"
	assoc <- findAssocs(dtm, term, 0.2)
	##NOTE(review): newer tm releases return a list keyed by term, older ones a
	##named vector that includes the term itself - handle both, confirm against
	##the installed tm version
	if (is.list(assoc)) assoc <- assoc[[term]]
	##drop the query term by name, then keep at most the ten strongest associations
	assoc <- assoc[names(assoc) != term]
	words <- names(head(assoc, 10))
	oi <- as.matrix(dtm)
	find <- colnames(oi) %in% words
	##drop = FALSE keeps a matrix even when only one term matches
	corr <- cor(oi[, find, drop = FALSE])
	corrplot(corr)

	##get the names of the 10 words that correlate the highest with "islam"
	term <- "islam"
	assoc <- findAssocs(dtm, term, 0.2)
	##NOTE(review): newer tm releases return a list keyed by term, older ones a
	##named vector that includes the term itself - handle both, confirm against
	##the installed tm version
	if (is.list(assoc)) assoc <- assoc[[term]]
	##drop the query term by name, then keep at most the ten strongest associations
	assoc <- assoc[names(assoc) != term]
	words <- names(head(assoc, 10))
	oi <- as.matrix(dtm)
	find <- colnames(oi) %in% words
	##drop = FALSE keeps a matrix even when only one term matches
	corr <- cor(oi[, find, drop = FALSE])
	corrplot(corr)