Skip to content

Instantly share code, notes, and snippets.

@thomasjensen
Created January 26, 2012 14:40
Show Gist options
  • Save thomasjensen/1683066 to your computer and use it in GitHub Desktop.
Save thomasjensen/1683066 to your computer and use it in GitHub Desktop.
text mining of Politikken
##read in the libraries and set the working directory
library(tm)
library(corrplot)
setwd("/path/to/")

##read in the data and subset it to the relevant categories
##fileEncoding re-encodes the latin1 file on read, so no further encoding
##handling is needed downstream; stringsAsFactors = FALSE keeps the article
##text as character on R versions older than 4.0
data <- read.csv(
  "indvandringPolitikken.csv",
  fileEncoding = "latin1",
  stringsAsFactors = FALSE
)
##%in% never returns NA, so rows with a missing category are dropped instead
##of turning into all-NA rows (which `==` combined with `|` would produce)
relevant_categories <- c("Politik", "Debat", "Kronikken", "Leder")
data <- data[data$kategori %in% relevant_categories, ]

##create the corpus and clean it
##VectorSource() takes only the text vector in current tm releases
corpus <- Corpus(VectorSource(as.character(data$artikel)))
##base functions such as tolower must be wrapped in content_transformer(),
##otherwise tm_map() returns bare character vectors and
##DocumentTermMatrix() fails
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("danish"))

##setup the document term matrix and remove sparse terms
dtm <- DocumentTermMatrix(corpus)
dtm <- removeSparseTerms(dtm, 0.99)
##dense copy of the matrix, computed once and reused by every step below
term_matrix <- as.matrix(dtm)

##get the top ten words
term_totals <- sort(colSums(term_matrix), decreasing = TRUE)
##head() avoids NA padding when fewer than ten terms survive; the ascending
##re-sort puts the most frequent term at the top of the horizontal barplot
top_terms <- sort(head(term_totals, 10))
barplot(
  top_terms,
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  col = grey.colors(length(top_terms)),
  main = "Frequency of Terms"
)

##Plot the correlations among the words most associated with `term`.
##  dtm         document term matrix passed to findAssocs()
##  term_matrix dense matrix version of `dtm` (documents x terms)
##  term        the word whose associated words are looked up
##  min_cor     lower correlation limit passed to findAssocs()
##  n_terms     maximum number of associated words to keep
##Returns the correlation matrix invisibly, or NULL (with a warning) when
##there is nothing to plot.
plot_assoc_correlations <- function(dtm, term_matrix, term,
                                    min_cor = 0.2, n_terms = 10) {
  if (!term %in% colnames(term_matrix)) {
    warning("Term not found in the matrix: ", term, call. = FALSE)
    return(invisible(NULL))
  }
  assoc <- findAssocs(dtm, term, min_cor)
  ##current tm returns a list with one named numeric vector per query term;
  ##older releases returned the vector itself with the query term first
  if (is.list(assoc)) {
    assoc <- assoc[[term]]
  }
  ##setdiff() drops the query term if present, replacing the old [2:11] skip
  words <- head(setdiff(names(assoc), term), n_terms)
  if (length(words) < 2) {
    warning(
      "Fewer than two words are associated with: ", term,
      call. = FALSE
    )
    return(invisible(NULL))
  }
  keep <- colnames(term_matrix) %in% words
  ##drop = FALSE keeps a matrix even for a very small selection
  corr <- cor(term_matrix[, keep, drop = FALSE])
  corrplot(corr)
  invisible(corr)
}

##get the names of the 10 words that correlate the highest with "indvandring"
plot_assoc_correlations(dtm, term_matrix, "indvandring", 0.2)

##get the names of the 10 words that correlate the highest with "islam"
plot_assoc_correlations(dtm, term_matrix, "islam", 0.2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment