Last active
October 13, 2015 05:48
-
-
Save josefslerka/4148592 to your computer and use it in GitHub Desktop.
Digital Humanities 7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tm) | |
library(textcat) | |
library(tm) | |
library(tau) | |
library(openNLP) | |
inspect(havel[1:2]) | |
havel[[1]] #pristup ke konkretnimu dokumentu | |
havel <- tm_map(havel, stripWhitespace) | |
havel <- tm_map(havel, tolower) | |
dtmHavel <- DocumentTermMatrix(corpusHavel) | |
# build a term-document matrix | |
mydata.corpus <- iconv(mydata.corpus, from="utf8", to="ASCII//TRANSLIT") | |
mydata.corpus <- gsub("\'", "", mydata.corpus) | |
mydata.corpus <- gsub("\"", "", mydata.corpus) | |
mydata.corpus <- gsub("\\n", "", mydata.corpus) | |
#### | |
library(tm) | |
corpusHavel <- Corpus(DirSource("havel", encoding = "UTF-8") , readerControl = list(language = "cz")) | |
corpusKlaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz")) | |
mydata.corpus <- corpusHavel | |
mydata.corpus <- iconv(mydata.corpus, from="utf8", to="ASCII//TRANSLIT") | |
mydata.corpus <- gsub("\'", "", mydata.corpus) | |
mydata.corpus <- gsub("\"", "", mydata.corpus) | |
mydata.corpus <- gsub("\\n", "", mydata.corpus) | |
mydata.corpus <- Corpus(VectorSource(mydata.corpus)) | |
mydata.corpus <- tm_map(mydata.corpus, tolower) | |
mydata.corpus <- tm_map(mydata.corpus, removePunctuation) | |
my_stopwords <- c(stopwords('english'), 'se', 'na', 'v', 'co', 'ze', | |
'bych','dekuji','kdy','tomu','totiz', | |
'panove', 'vazeni', 'svou', 'velmi', | |
'abych','toto', | |
'o', 'je', 'k', 'z', 'proti', 'neni','byly', 'si', 'dnes', 'cz', 'timto', 'budes', 'budem', 'byli', 'jses', 'muj', 'svym', 'ta', 'tomto', 'tohle', 'tuto', 'tyto', 'jej', 'zda', 'proc', 'mate', 'tato', 'kam', 'tohoto', 'kdo', 'kteri', 'mi', 'nam', 'tom', 'tomuto', 'mit', 'nic', 'proto', 'kterou', 'byla', 'toho', 'protoze', 'asi', 'ho', 'nasi', 'napiste', 're', 'rt', 'coz', 'tim', 'takze', 'svych', 'jeji', 'svymi', 'jste', 'aj', 'tu', 'tedy', 'teto', 'bylo', 'kde', 'ke', 'prave', 'ji', 'nad', 'nejsou', 'ci', 'pod', 'tema', 'mezi', 'pres', 'ty', 'pak', 'vam', 'ani', 'kdyz', 'vsak', 'ne', 'jsem', 'tento', 'aby', 'jsme', 'pred', 'pta', 'jejich', 'byl', 'jeste', 'az', 'bez', 'take', 'pouze', 'prvni', 'vase', 'ktera', 'nas', 'novy', 'pokud', 'muze', 'jeho', 'sve', 'jine', 'zpravy', 'nove', 'neni', 'vas', 'jen', 'podle', 'zde', 'clanek', 'uz', 'byt', 'vice', 'bude', 'jiz', 'nez', 'ktery', 'by', 'ktere', 'co', 'nebo', 'ten', 'tak', 'ma', 'pri', 'od', 'po', 'jsou', 'jak', 'dalsi', 'ale', 'si', 've', 'to', 'jako', 'za', 'zpet', 'ze', 'do', 'pro', 'je', 'na') | |
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords) | |
mydata.dtm <- TermDocumentMatrix(mydata.corpus) | |
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.65) | |
mydata.df <- as.data.frame(inspect(mydata.dtm2)) | |
mydata.df.scale <- scale(mydata.df) | |
d <- dist(mydata.df.scale, method = "euclidean") # distance matrix | |
fit <- hclust(d, method="ward") | |
plot(fit) # display dendogram? | |
#korelace | |
dtm <- DocumentTermMatrix(mydata.corpus) | |
dtm <- removeSparseTerms(dtm, 0.65) | |
##get the top ten words | |
top10 <- as.matrix(dtm) | |
v <- apply(top10,2,sum) | |
v <- sort(v, decreasing = TRUE) | |
v1 <- sort(v[1:10]) | |
barplot(v1, horiz=TRUE, cex.names = 0.7, las = 1, col=grey.colors(10), main="Frequency of Terms") | |
words <- names(findAssocs(dtm, "evropske", .2)[2:11]) | |
oi <- as.matrix(dtm) | |
find <- colnames(oi) | |
corr <- cor(oi[,find]) | |
corrplot(corr) | |
#wordcloud | |
ap.tdm <- TermDocumentMatrix(mydata.corpus) | |
ap.m <- as.matrix(ap.tdm) | |
ap.v <- sort(rowSums(ap.m),decreasing=TRUE) | |
ap.d <- data.frame(word = names(ap.v),freq=ap.v) | |
table(ap.d$freq) | |
pal2 <- brewer.pal(8,"Dark2") | |
png("wordcloud_packages.png", width=1024,height=768) | |
wordcloud(ap.d$word,ap.d$freq, scale=c(10,.2),min.freq=3, | |
max.words=150, random.order=FALSE, rot.per=.15, colors=pal2) | |
dev.off() | |
text = 'SPOZ se v Pardubickém a Zlínském kraji dostala do vlády. V Pardubickém se vytvořila koalice s ČSSD a lidovci a ve Zlínském se soc. dem. a komunisty, z čehož vyplývá, že sama SPOZ není ideologicky předpojatá. Ve Zlínském kraji byla volební účast zhruba stejná jako celostátní, tedy 36 procent. A já se ptám, co dělalo těch 64 procent občanů, kteří k volbám nešli? Teď pokřikují a pořádají demonstrace. Podle mého názoru lidé politikou znechucení nejdou k volbám, ale pak nemají právo si stěžovat, že volební výsledky dopadly tak, jak dopadly.' | |
textcat(text) | |
czstemm <- function(x, lang="czech") { | |
x <- toString(x) | |
sent <- sentDetect(x) | |
mydata.vectors <- character(0); | |
for(i in 1:length(sent)) { | |
x=sent[i] | |
text <- curlEscape(x) | |
url <- paste("http://localhost:9200/_analyze?text=",text,"&analyzer=", lang, "&stopwords=TRUE",sep="") | |
textstemm <- fromJSON(getURL(url)) | |
string <- "" | |
for(i in 1:length(textstemm$token)) { | |
string <- paste(string, textstemm$token[[i]][1], sep=" ") | |
} | |
mydata.vectors <- c(string ,mydata.vectors) | |
} | |
rev(mydata.vectors) | |
} | |
req = list(text=textklemm) | |
response <- fromJSON(getURL( | |
'http://cslemm.weps.cz/', | |
customrequest='POST', | |
httpheader=c('Content-Type'='application/json'), | |
postfields=toJSON(req))) | |
response$result | |
#ngram | |
library(textcat) | |
library(tau) | |
klaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz")) | |
textyklaus <- textcnt(klaus, method = "string",n=3) | |
sort(textyklaus, decreasing=TRUE)[1:50] | |
library(RCurl) | |
library(RJSONIO) | |
library(textcat) | |
library(tm) | |
library(tau) | |
library(openNLP) | |
#ncd | |
ncd <- function(file1, file2) { | |
both <- "" | |
file1 <- toString(file1) | |
file2 <- toString(file2) | |
comp1 <- memCompress(file1, type=c(c("bzip2"))) | |
comp2 <- memCompress(file2, type=c(c("bzip2"))) | |
lencomp1 = length(comp1) | |
lencomp2 = length(comp2) | |
both <- paste(both, file1, file2, sep="") | |
compboth <- memCompress(both, type=c(c("bzip2"))) | |
lencompboth = length(compboth) | |
result <- (lencompboth - min(lencomp1, lencomp2))/max(lencomp1,lencomp2) | |
result | |
} | |
#matice | |
mydata.vectors <- character(0) | |
for(i in 1:10) { | |
for(x in 1:10) { | |
ncdhavel <- ncd(havel[[i]],havel[[x]]) | |
mydata.vectors <- c(ncdhavel,mydata.vectors) | |
} | |
} | |
a <- matrix(rev(mydata.vectors), nrow=10) | |
names.vectors <- character(0) | |
for(x in 1:10) { names.vectors <- c(ID(havel[[x]]),names.vectors) } | |
dimnames(a) <- list(rev(names.vectors), rev(names.vectors)) | |
d <- dist(a, method = "euclidean") | |
fit <- hclust(d, method="ward") | |
plot(fit) # display dendogram | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment