Last active
December 17, 2015 11:39
-
-
Save Guibrich/5604146 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 18/05/2013 | |
# Key words : TextMining, Elections, France, Debate, 2nd Round | |
# We use the qdap package (TODO: add the link) and
# tm to perform the text-mining analysis, and the classic
# packages like ggplot2 or RColorBrewer to make the graphics pretty.
suppressPackageStartupMessages(require(twitteR)) | |
suppressPackageStartupMessages(require(XML)) | |
suppressPackageStartupMessages(require(tm)) | |
suppressPackageStartupMessages(require(rgdal)) | |
suppressPackageStartupMessages(require(ggplot2)) | |
suppressPackageStartupMessages(require(qdap)) | |
suppressPackageStartupMessages(require(rJava)) | |
suppressPackageStartupMessages(library(wordcloud)) | |
library(Rstem) | |
# NOTE(review): absolute, machine-specific working directory; the relative
# "./Data/..." path below is resolved against it.
setwd("D:/PERSO/R_Working/Tutoriels/TextMining")

# Hollande
# Read the debate transcript into a two-column table: the speaker ("person")
# and what was said ("dialogue").
debate <- read.transcript(
  "./Data/debat2tours.docx",
  col.names = c("person", "dialogue")
)
# Quick look at the loaded transcript (presumably a truncated preview; the
# arguments 5 and 50 are kept exactly as in the original call).
htruncdf(debate, 5, 50)

# Keep only the rows whose speaker is Hollande.
Hollande <- subset(debate, person == "HOLLANDE")

# Custom stop words: tm's French list, every single letter, and a few
# contractions and fillers.
sw <- c(
  "a", "ou", tm::stopwords("fr"), "c'est", "n'est", "s'y", "qu'on", "s'il",
  "ah", letters, "ca", "n'y", "d'un", "monsieur"
)
# Build a cleaned tm corpus from French text.
#
# Args:
#   df: Text source, handed unchanged to tm::VectorSource().
#       NOTE(review): the caller passes a whole data frame rather than a
#       single text column -- confirm that this is intended.
#   my.stopwords: Character vector of additional words to remove, on top of
#       tm's built-in French stop-word list. Defaults to none.
#
# Returns:
#   The corpus after lower-casing, stop-word removal, punctuation removal and
#   number removal.
generateCorpus <- function(df, my.stopwords = c()) {
  text2.corpus <- Corpus(
    VectorSource(df),
    readerControl = list(language = "fr")
  )

  # Lower-case first: removeWords() is case-sensitive. content_transformer()
  # is tm's documented wrapper for applying a plain character function
  # through tm_map() while keeping the documents as text documents.
  text2.corpus <- tm_map(text2.corpus, content_transformer(tolower))

  # Remove stop words BEFORE stripping punctuation. Otherwise "c'est" has
  # already become "cest" and apostrophe-containing stop words ("c'est",
  # "n'est", "qu'on", ...) can never match, and elided articles stay glued to
  # the following word ("l'emploi" -> "lemploi").
  text2.corpus <- tm_map(text2.corpus, removeWords, stopwords("fr"))
  if (length(my.stopwords) > 0) {
    text2.corpus <- tm_map(text2.corpus, removeWords, my.stopwords)
  }

  text2.corpus <- tm_map(text2.corpus, removePunctuation)
  text2.corpus <- tm_map(text2.corpus, removeNumbers)
  # text2.corpus <- tm_map(text2.corpus, stemDocument, language = "french")

  # Return the corpus explicitly instead of relying on the invisible value of
  # the last assignment.
  text2.corpus
}
# Clean Hollande's text with the custom stop-word list.
HollandeCorpus <- generateCorpus(Hollande, sw)

# We build a Term Document Matrix and turn it into a plain matrix
# (terms in rows, documents in columns).
H.tdm <- TermDocumentMatrix(HollandeCorpus)
H.m <- as.matrix(H.tdm)

# Total count of each term over all documents, most frequent first.
H.v <- sort(rowSums(H.m), decreasing = TRUE)
H.d <- data.frame(word = names(H.v), freq = H.v)

# Keep only terms that occur between 3 and 90 times (both bounds inclusive).
H.d <- H.d[H.d$freq >= 3 & H.d$freq <= 90, ]
# Compute the French stem of every remaining term (terms are the row names).
H.d$stem <- wordStem(row.names(H.d), language = "french")

# Copy the terms into a regular column; otherwise they would be lost when
# aggregating by stem.
H.d$word <- row.names(H.d)

# Per stem: the summed frequency, and the first word form encountered.
agg_freq <- stats::aggregate(freq ~ stem, data = H.d, FUN = sum)
agg_word <- stats::aggregate(
  word ~ stem,
  data = H.d,
  FUN = function(forms) forms[1]
)

# Put both aggregates side by side (columns: freq, stem, word).
forW <- cbind(freq = agg_freq$freq, agg_word)

# Sort by frequency, highest first.
forW <- forW[order(forW$freq, decreasing = TRUE), ]
# Wordcloud
# Render the aggregated word frequencies to "wordcloud_Hollande.png" in the
# current working directory.
col <- brewer.pal(8, "Dark2")
png("wordcloud_Hollande.png", width = 1280, height = 800)
# Close the PNG device even when wordcloud() fails; otherwise the device stays
# open and later plots would silently be drawn into this file.
invisible(tryCatch(
  wordcloud(
    forW$word, forW$freq,
    scale = c(8, .2), min.freq = 5,
    max.words = Inf, random.order = FALSE, rot.per = .20, colors = col
  ),
  finally = dev.off()
))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment