Created
January 26, 2012 14:40
-
-
Save thomasjensen/1683066 to your computer and use it in GitHub Desktop.
Text mining of Politiken
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Text-mining script: loads newspaper articles from a CSV file, builds a
## cleaned corpus and document-term matrix, plots the ten most frequent terms,
## and plots the correlations among the terms most strongly associated with
## the keywords "indvandring" and "islam".

## read in the libraries and set the working directory
library(tm)
library(corrplot)
setwd("/path/to/")

## Return the names of at most `n` terms most strongly associated with `term`
## (correlation >= `corlimit`), excluding `term` itself.
##
## dtm      : a DocumentTermMatrix
## term     : single character string, the keyword to look up
## corlimit : lower correlation bound passed on to findAssocs()
## n        : maximum number of associated terms to return
##
## Returns a character vector (possibly empty), strongest association first.
top_associated_terms <- function(dtm, term, corlimit = 0.2, n = 10L) {
  assocs <- findAssocs(dtm, term, corlimit)
  # Current tm returns a list keyed by term. NOTE(review): older tm releases
  # appear to have returned a bare named vector with the term itself first
  # (hence the original `[2:11]`) -- both shapes are handled here.
  if (is.list(assocs)) {
    assocs <- assocs[[term]]
  }
  if (length(assocs) == 0L) {
    return(character(0))
  }
  assocs <- assocs[names(assocs) != term]
  # head() instead of a fixed index range avoids NA names when fewer than
  # `n` associated terms exist.
  names(head(sort(assocs, decreasing = TRUE), n))
}

## Plot the correlation matrix of the terms most associated with `term`.
##
## term_matrix : dense document-by-term matrix (as.matrix() of `dtm`)
## dtm         : the DocumentTermMatrix `term_matrix` was built from
## term, corlimit, n : forwarded to top_associated_terms()
##
## Returns the correlation matrix invisibly, or NULL (with a warning) when
## fewer than two associated terms are available to correlate.
plot_term_correlations <- function(term_matrix, dtm, term,
                                   corlimit = 0.2, n = 10L) {
  words <- top_associated_terms(dtm, term, corlimit, n)
  if (length(words) < 2L) {
    warning("Fewer than two terms are associated with '", term,
            "'; skipping correlation plot.", call. = FALSE)
    return(invisible(NULL))
  }
  # Select by membership so columns keep the matrix's own term order.
  selected <- colnames(term_matrix) %in% words
  corr <- cor(term_matrix[, selected, drop = FALSE])
  corrplot(corr)
  invisible(corr)
}

## read in the data and subset it to the relevant categories
keep_categories <- c("Politik", "Debat", "Kronikken", "Leder")
data <- read.csv("indvandringPolitikken.csv", fileEncoding = "latin1")
# %in% never returns NA, so rows with a missing category are dropped instead
# of turning into all-NA rows as they would with `==` and `|`.
data <- data[data$kategori %in% keep_categories, ]

## create the corpus and clean it
# read.csv() has already handled the file encoding; VectorSource() takes only
# the text vector. as.character() guards against the column being a factor.
corpus <- Corpus(VectorSource(as.character(data$artikel)))
# Base functions such as tolower must be wrapped in content_transformer().
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("danish"))

## setup the document term matrix and remove sparse terms
dtm <- DocumentTermMatrix(corpus)
dtm <- removeSparseTerms(dtm, 0.99)

# Dense copy of the matrix, computed once and reused by every plot below.
term_matrix <- as.matrix(dtm)

## get the top ten words
term_freq <- sort(colSums(term_matrix), decreasing = TRUE)
# Re-sorted ascending so the most frequent term ends up at the top of the
# horizontal bar plot.
top_terms <- sort(head(term_freq, 10))
barplot(top_terms, horiz = TRUE, cex.names = 0.7, las = 1,
        col = grey.colors(length(top_terms)), main = "Frequency of Terms")

## correlations among the 10 words most associated with "indvandring"
plot_term_correlations(term_matrix, dtm, "indvandring", 0.2)

## correlations among the 10 words most associated with "islam"
plot_term_correlations(term_matrix, dtm, "islam", 0.2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment