Skip to content

Instantly share code, notes, and snippets.

@josefslerka
Last active October 13, 2015 05:48
Show Gist options
  • Save josefslerka/4148592 to your computer and use it in GitHub Desktop.
Save josefslerka/4148592 to your computer and use it in GitHub Desktop.
Digital Humanities 7
library(tm)
library(textcat)
library(tm)
library(tau)
library(openNLP)
inspect(havel[1:2])
havel[[1]] #pristup ke konkretnimu dokumentu
havel <- tm_map(havel, stripWhitespace)
havel <- tm_map(havel, tolower)
dtmHavel <- DocumentTermMatrix(corpusHavel)
# build a term-document matrix
mydata.corpus <- iconv(mydata.corpus, from="utf8", to="ASCII//TRANSLIT")
mydata.corpus <- gsub("\'", "", mydata.corpus)
mydata.corpus <- gsub("\"", "", mydata.corpus)
mydata.corpus <- gsub("\\n", "", mydata.corpus)
####
library(tm)
corpusHavel <- Corpus(DirSource("havel", encoding = "UTF-8") , readerControl = list(language = "cz"))
corpusKlaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz"))
mydata.corpus <- corpusHavel
mydata.corpus <- iconv(mydata.corpus, from="utf8", to="ASCII//TRANSLIT")
mydata.corpus <- gsub("\'", "", mydata.corpus)
mydata.corpus <- gsub("\"", "", mydata.corpus)
mydata.corpus <- gsub("\\n", "", mydata.corpus)
mydata.corpus <- Corpus(VectorSource(mydata.corpus))
mydata.corpus <- tm_map(mydata.corpus, tolower)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation)
my_stopwords <- c(stopwords('english'), 'se', 'na', 'v', 'co', 'ze',
'bych','dekuji','kdy','tomu','totiz',
'panove', 'vazeni', 'svou', 'velmi',
'abych','toto',
'o', 'je', 'k', 'z', 'proti', 'neni','byly', 'si', 'dnes', 'cz', 'timto', 'budes', 'budem', 'byli', 'jses', 'muj', 'svym', 'ta', 'tomto', 'tohle', 'tuto', 'tyto', 'jej', 'zda', 'proc', 'mate', 'tato', 'kam', 'tohoto', 'kdo', 'kteri', 'mi', 'nam', 'tom', 'tomuto', 'mit', 'nic', 'proto', 'kterou', 'byla', 'toho', 'protoze', 'asi', 'ho', 'nasi', 'napiste', 're', 'rt', 'coz', 'tim', 'takze', 'svych', 'jeji', 'svymi', 'jste', 'aj', 'tu', 'tedy', 'teto', 'bylo', 'kde', 'ke', 'prave', 'ji', 'nad', 'nejsou', 'ci', 'pod', 'tema', 'mezi', 'pres', 'ty', 'pak', 'vam', 'ani', 'kdyz', 'vsak', 'ne', 'jsem', 'tento', 'aby', 'jsme', 'pred', 'pta', 'jejich', 'byl', 'jeste', 'az', 'bez', 'take', 'pouze', 'prvni', 'vase', 'ktera', 'nas', 'novy', 'pokud', 'muze', 'jeho', 'sve', 'jine', 'zpravy', 'nove', 'neni', 'vas', 'jen', 'podle', 'zde', 'clanek', 'uz', 'byt', 'vice', 'bude', 'jiz', 'nez', 'ktery', 'by', 'ktere', 'co', 'nebo', 'ten', 'tak', 'ma', 'pri', 'od', 'po', 'jsou', 'jak', 'dalsi', 'ale', 'si', 've', 'to', 'jako', 'za', 'zpet', 'ze', 'do', 'pro', 'je', 'na')
mydata.corpus <- tm_map(mydata.corpus, removeWords, my_stopwords)
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse=0.65)
mydata.df <- as.data.frame(inspect(mydata.dtm2))
mydata.df.scale <- scale(mydata.df)
d <- dist(mydata.df.scale, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward")
plot(fit) # display dendogram?
#korelace
dtm <- DocumentTermMatrix(mydata.corpus)
dtm <- removeSparseTerms(dtm, 0.65)
##get the top ten words
top10 <- as.matrix(dtm)
v <- apply(top10,2,sum)
v <- sort(v, decreasing = TRUE)
v1 <- sort(v[1:10])
barplot(v1, horiz=TRUE, cex.names = 0.7, las = 1, col=grey.colors(10), main="Frequency of Terms")
words <- names(findAssocs(dtm, "evropske", .2)[2:11])
oi <- as.matrix(dtm)
find <- colnames(oi)
corr <- cor(oi[,find])
corrplot(corr)
#wordcloud
ap.tdm <- TermDocumentMatrix(mydata.corpus)
ap.m <- as.matrix(ap.tdm)
ap.v <- sort(rowSums(ap.m),decreasing=TRUE)
ap.d <- data.frame(word = names(ap.v),freq=ap.v)
table(ap.d$freq)
pal2 <- brewer.pal(8,"Dark2")
png("wordcloud_packages.png", width=1024,height=768)
wordcloud(ap.d$word,ap.d$freq, scale=c(10,.2),min.freq=3,
max.words=150, random.order=FALSE, rot.per=.15, colors=pal2)
dev.off()
text = 'SPOZ se v Pardubickém a Zlínském kraji dostala do vlády. V Pardubickém se vytvořila koalice s ČSSD a lidovci a ve Zlínském se soc. dem. a komunisty, z čehož vyplývá, že sama SPOZ není ideologicky předpojatá. Ve Zlínském kraji byla volební účast zhruba stejná jako celostátní, tedy 36 procent. A já se ptám, co dělalo těch 64 procent občanů, kteří k volbám nešli? Teď pokřikují a pořádají demonstrace. Podle mého názoru lidé politikou znechucení nejdou k volbám, ale pak nemají právo si stěžovat, že volební výsledky dopadly tak, jak dopadly.'
textcat(text)
czstemm <- function(x, lang="czech") {
x <- toString(x)
sent <- sentDetect(x)
mydata.vectors <- character(0);
for(i in 1:length(sent)) {
x=sent[i]
text <- curlEscape(x)
url <- paste("http://localhost:9200/_analyze?text=",text,"&analyzer=", lang, "&stopwords=TRUE",sep="")
textstemm <- fromJSON(getURL(url))
string <- ""
for(i in 1:length(textstemm$token)) {
string <- paste(string, textstemm$token[[i]][1], sep=" ")
}
mydata.vectors <- c(string ,mydata.vectors)
}
rev(mydata.vectors)
}
req = list(text=textklemm)
response <- fromJSON(getURL(
'http://cslemm.weps.cz/',
customrequest='POST',
httpheader=c('Content-Type'='application/json'),
postfields=toJSON(req)))
response$result
#ngram
library(textcat)
library(tau)
klaus <- Corpus(DirSource("klaus"), readerControl = list(language = "cz"))
textyklaus <- textcnt(klaus, method = "string",n=3)
sort(textyklaus, decreasing=TRUE)[1:50]
library(RCurl)
library(RJSONIO)
library(textcat)
library(tm)
library(tau)
library(openNLP)
#ncd
ncd <- function(file1, file2) {
both <- ""
file1 <- toString(file1)
file2 <- toString(file2)
comp1 <- memCompress(file1, type=c(c("bzip2")))
comp2 <- memCompress(file2, type=c(c("bzip2")))
lencomp1 = length(comp1)
lencomp2 = length(comp2)
both <- paste(both, file1, file2, sep="")
compboth <- memCompress(both, type=c(c("bzip2")))
lencompboth = length(compboth)
result <- (lencompboth - min(lencomp1, lencomp2))/max(lencomp1,lencomp2)
result
}
#matice
mydata.vectors <- character(0)
for(i in 1:10) {
for(x in 1:10) {
ncdhavel <- ncd(havel[[i]],havel[[x]])
mydata.vectors <- c(ncdhavel,mydata.vectors)
}
}
a <- matrix(rev(mydata.vectors), nrow=10)
names.vectors <- character(0)
for(x in 1:10) { names.vectors <- c(ID(havel[[x]]),names.vectors) }
dimnames(a) <- list(rev(names.vectors), rev(names.vectors))
d <- dist(a, method = "euclidean")
fit <- hclust(d, method="ward")
plot(fit) # display dendogram
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment