Skip to content

Instantly share code, notes, and snippets.

@emraher
Last active August 29, 2015 14:15
Show Gist options
  • Save emraher/09e8333b227078c5bb60 to your computer and use it in GitHub Desktop.
Save emraher/09e8333b227078c5bb60 to your computer and use it in GitHub Desktop.
TBMMGenelKurulu Twitter
acaba
altı
ama
ancak
artık
asla
aslında
az
bana
bazen
bazı
bazıları
bazısı
belki
ben
beni
benim
beş
bile
bir
birçoğu
birçok
birçokları
biri
birisi
birkaç
birkaçı
birşey
birşeyi
biz
bize
bizi
bizim
böyle
böylece
bu
buna
bunda
bundan
bunu
bunun
burada
bütün
çoğu
çoğuna
çoğunu
çok
çünkü
da
daha
de
değil
demek
diğer
diğeri
diğerleri
diye
dokuz
dolayı
dört
elbette
en
fakat
falan
felan
filan
gene
gibi
hâlâ
hangi
hangisi
hani
hatta
hem
henüz
hep
hepsi
hepsine
hepsini
her
her biri
herkes
herkese
herkesi
hiç
hiç kimse
hiçbiri
hiçbirine
hiçbirini
için
içinde
iki
ile
ise
işte
kaç
kadar
kendi
kendine
kendini
ki
kim
kime
kimi
kimin
kimisi
madem
mi
mu
nasıl
ne
ne kadar
ne zaman
neden
nedir
nerde
nerede
nereden
nereye
nesi
neyse
niçin
niye
on
ona
ondan
onlar
onlara
onlardan
onları
onların
onu
onun
orada
oysa
oysaki
öbürü
ön
önce
ötürü
öyle
rağmen
sana
sekiz
sen
senden
seni
senin
siz
sizden
size
sizi
sizin
son
sonra
şayet
şey
şeyden
şeye
şeyi
şeyler
şimdi
şöyle
şu
şuna
şunda
şundan
şunlar
şunu
şunun
tabi
tamam
tüm
tümü
üç
üzere
var
ve
veya
veyahut
ya
ya da
yani
yedi
yerine
yine
yoksa
zaten
zira
# Start from a clean slate: remove every (non-hidden) object from the current
# environment before the analysis runs.
# NOTE(review): this also wipes anything the user had in an interactive
# session; presumably it is here so that the save.image() call at the end of
# the script stores only objects created by this script -- confirm.
rm(list=ls())
# Load required libraries
library(RCurl)
library(stringr)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(twitteR)
library(streamR)
library(grid)
library(ggplot2)
library(wesanderson)
# Load credentials ============================================================= IMPORTANT
# The Twitter requests below presumably need a registered OAuth session; the
# two commented-out lines show how one is expected to be restored.
# SEE:http://thinktostart.com/twitter-authentification-with-r/
# UNCOMMENT LINES BELOW (and put the path of the saved OAuth object, expected
# to be called `my_oauth`, inside load("")) ==================================== IMPORTANT
#load("")
#registerTwitterOAuth(my_oauth)
# NOTE(review): registerTwitterOAuth() may not be available in newer twitteR
# releases -- confirm against the installed version.
# Load credentials ============================================================= IMPORTANT
# Tell RCurl to verify SSL connections against the cacert.pem bundle that is
# shipped in the "CurlSSL" directory of the RCurl package itself.
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
# Set seed user
user <- "TBMMGenelKurulu"
# Look up the seed user and echo the screen name as a quick sanity check
seed <- getUser(user)
(seed.n <- seed$screenName)
# Get the timeline (retweets are excluded)
ut <- userTimeline(user, n = 3200, includeRts = FALSE, encoding = "utf-8")
# Stop here with a clear message instead of failing later on an empty corpus
if (length(ut) == 0) {
  stop("No tweets were returned for user '", user, "'", call. = FALSE)
}
# Extract tweets. vapply() (unlike sapply()) always yields a character vector
# and errors if a status does not produce exactly one string.
tweets.text <- vapply(ut, function(x) x$getText(), character(1))
# Preview a few raw tweets (the second preview shows NA when fewer than
# 50 tweets were fetched)
head(tweets.text)
tweets.text[42:50]
# Normalise raw tweet text into lower-case, letters-only words separated by
# single spaces.
#
# Args:
#   x: character vector of raw tweet texts.
# Returns:
#   A character vector of the same length as `x` with links, @mentions,
#   digits and punctuation removed, whitespace collapsed, and "ak parti"
#   rewritten as "akp".
clean_tweet_text <- function(x) {
  # Links and @mentions have to be removed first: they can only be recognised
  # while ":", "/" and "@" are still in the text. (Stripping punctuation
  # beforehand left the user names behind as ordinary words.)
  x <- gsub("http\\S+", " ", x)
  x <- gsub("@\\w+", " ", x)
  # Tabs and newlines are word boundaries, not characters to delete
  x <- gsub("[[:space:]]+", " ", x)
  # Keep only Latin/Turkish letters (including circumflex forms such as the
  # ones in "hâlâ") and spaces; this drops digits and all punctuation
  x <- gsub("[^a-zA-ZğüşöçıİĞÜŞÖÇâîûÂÎÛ ]", "", x)
  # Turkish casing: "I" lowers to dotless "ı" and "İ" to "i"; tolower() alone
  # would map "I" to "i"
  x <- tolower(chartr("Iİ", "ıi", x))
  # Collapse runs of spaces to ONE space (replacing them with "" glued the
  # neighbouring words together), then trim both ends
  x <- gsub(" {2,}", " ", x)
  x <- gsub("^ +| +$", "", x)
  # Replace AK Parti with akp
  gsub("ak parti", "akp", x)
}
tweets.text <- clean_tweet_text(tweets.text)
# Create corpus (one document per cleaned tweet)
tweets.text.corpus <- Corpus(VectorSource(tweets.text))
# Clean up by removing stop words
# (default-language variant, kept for reference only)
#tweets.text.corpus <- tm_map(tweets.text.corpus, function(x)removeWords(x,stopwords()))
# Turkish stopwords ============================================================ IMPORTANT
# Save stop-words-turkish.txt into your working directory
stop_file <- "stop-words-turkish.txt"
if (!file.exists(stop_file)) {
  stop("Stop word file '", stop_file, "' not found in ", getwd(), call. = FALSE)
}
# One stop word (or phrase) per line. Quote and comment handling are switched
# off so that "'" or "#" in the file cannot swallow lines, and the strings are
# marked as UTF-8 so the Turkish letters survive in non-UTF-8 locales.
turkish <- read.table(stop_file, sep = "\n", stringsAsFactors = FALSE,
                      quote = "", comment.char = "", strip.white = TRUE,
                      encoding = "UTF-8")
# Flatten the one-column data frame into a plain character vector
turkish_stop <- unlist(turkish, use.names = FALSE)
# ============================================================================== IMPORTANT
# Create term-document and document-term matrices applying some
# transformations. To be safe the text is re-cleaned here as well.
#
# Hand-picked stop words (frequent fillers, leftovers such as "http", and the
# ordinal suffixes "inci"/"uncu" that remain once numbers are removed)
custom_stop <- c("bir",
                 "gibi",
                 "ama",
                 "daha",
                 "yok",
                 "http",
                 "ben",
                 "belki",
                 "hiçbir",
                 "sen",
                 "var",
                 "neden",
                 "nasi",
                 "ile",
                 "nasıl",
                 "kadar",
                 "kim",
                 "için",
                 "inci",
                 "uncu")
# The stop words read from file must be part of the `stopwords` vector itself.
# Passed as a separate, unnamed element of the control list they are silently
# ignored and never removed from the text.
all_stop <- unique(c(custom_stop,
                     as.character(unlist(turkish_stop, use.names = FALSE))))
# One shared control list keeps both matrices consistent
term_control <- list(
  removePunctuation = TRUE,
  stopwords = all_stop,
  removeNumbers = TRUE,
  tolower = TRUE)
# Terms in rows, tweets in columns
tdm <- TermDocumentMatrix(tweets.text.corpus, control = term_control)
# Create DTM: tweets in rows, terms in columns
dtm <- DocumentTermMatrix(tweets.text.corpus, control = term_control)
# Associations: for each party abbreviation, list the terms in `dtm` whose
# correlation with it reaches the lower limit of 0.15. The results are not
# stored; they are only shown through auto-printing when run at top level.
findAssocs(dtm, "akp", corlimit=0.15)
findAssocs(dtm, "bdp", corlimit=0.15)
findAssocs(dtm, "chp", corlimit=0.15)
findAssocs(dtm, "hdp", corlimit=0.15)
findAssocs(dtm, "mhp", corlimit=0.15)
# Party correlation plots
#
# Find the terms correlated with `term` and save a dot plot of them as PNG.
#
# Args:
#   dtm:      document-term matrix to search.
#   term:     term of interest.
#   corlimit: lower correlation bound limit.
#   file:     output PNG path; defaults to "<term>.png".
# Returns (invisibly):
#   data frame with columns `corr` (correlation) and `terms` (factor whose
#   levels keep the order returned by findAssocs()). It has zero rows, and no
#   file is written, when nothing reaches `corlimit`.
plot_term_assocs <- function(dtm, term, corlimit, file = paste0(term, ".png")) {
  # Query once so correlations and term labels always come from the same result
  assoc <- findAssocs(dtm, term, corlimit)
  # Depending on the tm version the result is either a one-column matrix or a
  # named list of named numeric vectors; reduce both to a named vector.
  if (is.list(assoc)) {
    assoc <- assoc[[term]]
  } else if (is.matrix(assoc)) {
    assoc <- setNames(assoc[, 1], rownames(assoc))
  }
  terms <- as.character(names(assoc))
  assoc_df <- data.frame(corr = as.numeric(assoc),
                         terms = factor(terms, levels = terms),
                         row.names = terms)
  if (nrow(assoc_df) == 0) {
    message("No terms correlate with \"", term, "\" at or above ", corlimit,
            "; skipping ", file)
    return(invisible(assoc_df))
  }
  # Plot and save the image in png format
  png(file, width = 9, height = 9, units = "in", res = 500)
  # Close the device even if plotting fails
  on.exit(dev.off(), add = TRUE)
  assoc_plot <- ggplot(assoc_df, aes(y = terms)) +
    geom_point(aes(x = corr, size = corr), data = assoc_df) +
    scale_size(range = c(3, 15)) +
    ylab("") +
    xlab(paste0("Correlation with the term ", "\"", term, "\""))
  # ggplot objects only draw when printed; auto-printing does not happen
  # inside functions or when the script is run through source()
  print(assoc_plot)
  invisible(assoc_df)
}
corlimit <- 0.15 # lower correlation bound limit.
# The "_0.3" suffix is historical; the limit actually used is `corlimit`.
# AKP Correlation
akp_0.3 <- plot_term_assocs(dtm, "akp", corlimit)
# BDP Correlation
bdp_0.3 <- plot_term_assocs(dtm, "bdp", corlimit)
# CHP Correlation
chp_0.3 <- plot_term_assocs(dtm, "chp", corlimit)
# HDP Correlation
hdp_0.3 <- plot_term_assocs(dtm, "hdp", corlimit)
# MHP Correlation
mhp_0.3 <- plot_term_assocs(dtm, "mhp", corlimit)
# WORDCLOUD
# Define tdm as matrix (terms in rows, tweets in columns)
m <- as.matrix(tdm)
# Get word counts in decreasing order
word_freqs <- sort(rowSums(m), decreasing = TRUE)
# Create a data frame with words and their frequencies; keep the words as
# plain strings rather than a factor
dm <- data.frame(word = names(word_freqs), freq = word_freqs,
                 stringsAsFactors = FALSE)
# Color: newer wesanderson releases ship this palette as "Zissou1" instead of
# "Zissou", so use whichever name the installed version provides
zissou_name <- if ("Zissou1" %in% names(wes_palettes)) "Zissou1" else "Zissou"
la_cont <- wes_palette(name = zissou_name, type = "continuous")
# Plot and save the image in png format
png("tbmm.png", width = 9, height = 9, units = "in", res = 500)
wordcloud(dm$word, dm$freq, random.order = FALSE, min.freq = 2,
          scale = c(4, 0.5), max.words = 100, colors = la_cont)
dev.off()
# Save workspace
save.image(file = "tbmm.RData")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment