TBMMGenelKurulu Twitter
stop-words-turkish.txt (Turkish stopword list, one entry per line):
acaba
altı
ama
ancak
artık
asla
aslında
az
bana
bazen
bazı
bazıları
bazısı
belki
ben
beni
benim
beş
bile
bir
birçoğu
birçok
birçokları
biri
birisi
birkaç
birkaçı
birşey
birşeyi
biz
bize
bizi
bizim
böyle
böylece
bu
buna
bunda
bundan
bunu
bunun
burada
bütün
çoğu
çoğuna
çoğunu
çok
çünkü
da
daha
de
değil
demek
diğer
diğeri
diğerleri
diye
dokuz
dolayı
dört
elbette
en
fakat
falan
felan
filan
gene
gibi
hâlâ
hangi
hangisi
hani
hatta
hem
henüz
hep
hepsi
hepsine
hepsini
her
her biri
herkes
herkese
herkesi
hiç
hiç kimse
hiçbiri
hiçbirine
hiçbirini
için
içinde
iki
ile
ise
işte
kaç
kadar
kendi
kendine
kendini
ki
kim
kime
kimi
kimin
kimisi
madem
mı
mi
mu
mü
nasıl
ne
ne kadar
ne zaman
neden
nedir
nerde
nerede
nereden
nereye
nesi
neyse
niçin
niye
on
ona
ondan
onlar
onlara
onlardan
onların
onu
onun
orada
oysa
oysaki
öbürü
ön
önce
ötürü
öyle
rağmen
sana
sekiz
sen
senden
seni
senin
siz
sizden
size
sizi
sizin
son
sonra
şayet
şey
şeyden
şeye
şeyi
şeyler
şimdi
şöyle
şu
şuna
şunda
şundan
şunlar
şunu
şunun
tabi
tamam
tüm
tümü
üç
üzere
var
ve
veya
veyahut
ya
ya da
yani
yedi
yerine
yine
yoksa
zaten
zira
Analysis script (R):
rm(list = ls())
# Load required libraries
library(RCurl)
library(stringr)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(twitteR)
library(streamR)
library(grid)
library(ggplot2)
library(wesanderson)
# Load credentials ============================================================= IMPORTANT
# SEE: http://thinktostart.com/twitter-authentification-with-r/
# UNCOMMENT LINES BELOW ======================================================== IMPORTANT
#load("")
#registerTwitterOAuth(my_oauth)
# Load credentials ============================================================= IMPORTANT
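# A minimal sketch of how `my_oauth` can be created and saved beforehand with
# the ROAuth package, following the tutorial linked above. The key/secret
# placeholders and the file name "my_oauth.Rdata" are illustrative, not part
# of the original workflow:
# library(ROAuth)
# my_oauth <- OAuthFactory$new(consumerKey    = "YOUR_CONSUMER_KEY",
#                              consumerSecret = "YOUR_CONSUMER_SECRET",
#                              requestURL = "https://api.twitter.com/oauth/request_token",
#                              accessURL  = "https://api.twitter.com/oauth/access_token",
#                              authURL    = "https://api.twitter.com/oauth/authorize")
# my_oauth$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))
# save(my_oauth, file = "my_oauth.Rdata")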
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
# Set seed user
user <- "TBMMGenelKurulu"
# Get data for the seed user
seed <- getUser(user)
(seed.n <- seed$screenName)
# Get the timeline (the REST API returns at most ~3,200 of a user's most recent tweets)
ut <- userTimeline(user, n = 3200, includeRts = FALSE, encoding = "utf-8")
# Extract tweet texts
tweets.text <- sapply(ut, function(x) x$getText())
head(tweets.text)
tweets.text[42:50]
# Remove links (before stripping punctuation, so the URL pattern still matches)
tweets.text <- gsub("http\\S+", "", tweets.text)
# Remove @UserName mentions
tweets.text <- gsub("@\\w+", "", tweets.text)
# Remove everything except letters (including Turkish characters) and spaces
tweets.text <- gsub("[^a-zA-ZğüşöçıİĞÜŞÖÇ ]", "", tweets.text)
# Convert all text to lower case (note: tolower() is locale-dependent for "İ"/"I")
tweets.text <- tolower(tweets.text)
# Collapse runs of spaces and tabs into a single space
tweets.text <- gsub("[ \t]{2,}", " ", tweets.text)
# Trim blank spaces at the beginning and end
tweets.text <- gsub("^ +| +$", "", tweets.text)
# Replace "ak parti" with "akp" so the party is counted as one token
tweets.text <- gsub("ak parti", "akp", tweets.text)
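# Illustration of the cleaning pipeline on an invented sample tweet:
#   "@kullanici Genel Kurul toplandı! https://t.co/abc123"
# comes out of the steps above as:
#   "genel kurul toplandı"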
# Create corpus
tweets.text.corpus <- Corpus(VectorSource(tweets.text))
# Clean up by removing stop words
#tweets.text.corpus <- tm_map(tweets.text.corpus, function(x) removeWords(x, stopwords()))
# Turkish stopwords ============================================================ IMPORTANT
# Save stop-words-turkish.txt (the list above) into your working directory
turkish <- read.table("stop-words-turkish.txt", sep = "\n", stringsAsFactors = FALSE)
turkish_stop <- unlist(turkish)
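# Quick sanity check that the stopword list came in as one word per element;
# the exact length depends on the file contents:
head(turkish_stop)
length(turkish_stop)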
# ============================================================================== IMPORTANT
# Create term-document matrix applying some transformations
# To be safe, re-clean the text here as well; turkish_stop is appended to the
# ad-hoc stopword vector so both lists are removed
tdm <- TermDocumentMatrix(
  tweets.text.corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c("bir", "gibi", "ama", "daha", "yok", "http", "ben",
                  "belki", "hiçbir", "sen", "var", "neden", "nasi",
                  "ile", "nasıl", "kadar", "kim", "için", "inci", "uncu",
                  turkish_stop),
    removeNumbers = TRUE,
    tolower = TRUE))
# Create document-term matrix with the same transformations
dtm <- DocumentTermMatrix(
  tweets.text.corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c("bir", "gibi", "ama", "daha", "yok", "http", "ben",
                  "belki", "hiçbir", "sen", "var", "neden", "nasi",
                  "ile", "nasıl", "kadar", "kim", "için", "inci", "uncu",
                  turkish_stop),
    removeNumbers = TRUE,
    tolower = TRUE))
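# A quick look at what made it into the matrices; findFreqTerms() lists the
# terms at or above a frequency threshold (10 here is an arbitrary choice):
dim(tdm)
findFreqTerms(tdm, lowfreq = 10)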
# Associations: terms whose occurrence across documents correlates with the
# query term at or above corlimit
findAssocs(dtm, "akp", corlimit = 0.15)
findAssocs(dtm, "bdp", corlimit = 0.15)
findAssocs(dtm, "chp", corlimit = 0.15)
findAssocs(dtm, "hdp", corlimit = 0.15)
findAssocs(dtm, "mhp", corlimit = 0.15)
# AKP correlation plot
toi <- "akp"      # term of interest
corlimit <- 0.15  # lower correlation bound
# findAssocs() returns a list with one named numeric vector per query term
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
akp_assoc <- data.frame(corr = assocs, terms = names(assocs))
# Fix the factor levels so the plot keeps the decreasing-correlation order
akp_assoc$terms <- factor(akp_assoc$terms, levels = akp_assoc$terms)
# Plot and save the image in png format
png("akp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(akp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# BDP correlation plot
toi <- "bdp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
bdp_assoc <- data.frame(corr = assocs, terms = names(assocs))
bdp_assoc$terms <- factor(bdp_assoc$terms, levels = bdp_assoc$terms)
png("bdp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(bdp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# CHP correlation plot
toi <- "chp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
chp_assoc <- data.frame(corr = assocs, terms = names(assocs))
chp_assoc$terms <- factor(chp_assoc$terms, levels = chp_assoc$terms)
png("chp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(chp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# HDP correlation plot
toi <- "hdp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
hdp_assoc <- data.frame(corr = assocs, terms = names(assocs))
hdp_assoc$terms <- factor(hdp_assoc$terms, levels = hdp_assoc$terms)
png("hdp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(hdp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# MHP correlation plot
toi <- "mhp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
mhp_assoc <- data.frame(corr = assocs, terms = names(assocs))
mhp_assoc$terms <- factor(mhp_assoc$terms, levels = mhp_assoc$terms)
png("mhp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(mhp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
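# The five blocks above repeat the same steps; an equivalent loop (a sketch
# that would produce the same five PNGs) could replace them:
# for (toi in c("akp", "bdp", "chp", "hdp", "mhp")) {
#   a  <- findAssocs(dtm, toi, 0.15)[[toi]]
#   df <- data.frame(corr = a, terms = factor(names(a), levels = names(a)))
#   png(paste0(toi, ".png"), width = 9, height = 9, units = "in", res = 500)
#   print(ggplot(df, aes(y = terms)) +
#           geom_point(aes(x = corr, size = corr)) +
#           scale_size(range = c(3, 15)) +
#           ylab("") +
#           xlab(paste0("Correlation with the term \"", toi, "\"")))
#   dev.off()
# }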
# WORDCLOUD
# Convert the tdm to a plain matrix
m <- as.matrix(tdm)
# Get word counts in decreasing order
word_freqs <- sort(rowSums(m), decreasing = TRUE)
# Create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
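# Peek at the most frequent terms before plotting:
head(dm, 10)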
# Color palette (newer wesanderson releases rename this palette to "Zissou1")
la_cont <- wes_palette(name = "Zissou", type = "continuous")
# Plot and save the image in png format
png("tbmm.png", width = 9, height = 9, units = "in", res = 500)
wordcloud(dm$word, dm$freq, random.order = FALSE, min.freq = 2,
          scale = c(4, 0.5), max.words = 100, colors = la_cont)
dev.off()
# Save workspace
save.image(file = "tbmm.RData")