TBMMGenelKurulu Twitter
stop-words-turkish.txt (Turkish stopword list, one entry per line):
acaba
altı
ama
ancak
artık
asla
aslında
az
bana
bazen
bazı
bazıları
bazısı
belki
ben
beni
benim
beş
bile
bir
birçoğu
birçok
birçokları
biri
birisi
birkaç
birkaçı
birşey
birşeyi
biz
bize
bizi
bizim
böyle
böylece
bu
buna
bunda
bundan
bunu
bunun
burada
bütün
çoğu
çoğuna
çoğunu
çok
çünkü
da
daha
de
değil
demek
diğer
diğeri
diğerleri
diye
dokuz
dolayı
dört
elbette
en
fakat
falan
felan
filan
gene
gibi
hâlâ
hangi
hangisi
hani
hatta
hem
henüz
hep
hepsi
hepsine
hepsini
her
her biri
herkes
herkese
herkesi
hiç
hiç kimse
hiçbiri
hiçbirine
hiçbirini
için
içinde
iki
ile
ise
işte
kaç
kadar
kendi
kendine
kendini
ki
kim
kime
kimi
kimin
kimisi
madem
mı
mi
mu
mü
nasıl
ne
ne kadar
ne zaman
neden
nedir
nerde
nerede
nereden
nereye
nesi
neyse
niçin
niye
on
ona
ondan
onlar
onlara
onlardan
onların
onu
onun
orada
oysa
oysaki
öbürü
ön
önce
ötürü
öyle
rağmen
sana
sekiz
sen
senden
seni
senin
siz
sizden
size
sizi
sizin
son
sonra
şayet
şey
şeyden
şeye
şeyi
şeyler
şimdi
şöyle
şu
şuna
şunda
şundan
şunlar
şunu
şunun
tabi
tamam
tüm
tümü
üç
üzere
var
ve
veya
veyahut
ya
ya da
yani
yedi
yerine
yine
yoksa
zaten
zira
Analysis script (R):
rm(list = ls())
# Load required libraries
library(RCurl)
library(stringr)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(twitteR)
library(streamR)
library(grid)
library(ggplot2)
library(wesanderson)
# Load credentials ============================================================= IMPORTANT
# SEE: http://thinktostart.com/twitter-authentification-with-r/
# UNCOMMENT LINES BELOW ======================================================== IMPORTANT
#load("")
#registerTwitterOAuth(my_oauth)
# Load credentials ============================================================= IMPORTANT
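# A minimal sketch of how `my_oauth` can be created and saved beforehand with
# the ROAuth package, following the tutorial linked above. The key/secret
# placeholders and the file name "my_oauth.Rdata" are illustrative, not part
# of the original workflow:
# library(ROAuth)
# my_oauth <- OAuthFactory$new(consumerKey    = "YOUR_CONSUMER_KEY",
#                              consumerSecret = "YOUR_CONSUMER_SECRET",
#                              requestURL = "https://api.twitter.com/oauth/request_token",
#                              accessURL  = "https://api.twitter.com/oauth/access_token",
#                              authURL    = "https://api.twitter.com/oauth/authorize")
# my_oauth$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))
# save(my_oauth, file = "my_oauth.Rdata")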
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
# Set seed user
user <- "TBMMGenelKurulu"
# Get data for the seed user
seed <- getUser(user)
(seed.n <- seed$screenName)
# Get the timeline (the REST API returns at most ~3,200 of a user's most recent tweets)
ut <- userTimeline(user, n = 3200, includeRts = FALSE, encoding = "utf-8")
# Extract tweet texts
tweets.text <- sapply(ut, function(x) x$getText())
head(tweets.text)
tweets.text[42:50]
# Remove links (before stripping punctuation, so the URL pattern still matches)
tweets.text <- gsub("http\\S+", "", tweets.text)
# Remove @UserName mentions
tweets.text <- gsub("@\\w+", "", tweets.text)
# Remove everything except letters (including Turkish characters) and spaces
tweets.text <- gsub("[^a-zA-ZğüşöçıİĞÜŞÖÇ ]", "", tweets.text)
# Convert all text to lower case (note: tolower() is locale-dependent for "İ"/"I")
tweets.text <- tolower(tweets.text)
# Collapse runs of spaces and tabs into a single space
tweets.text <- gsub("[ \t]{2,}", " ", tweets.text)
# Trim blank spaces at the beginning and end
tweets.text <- gsub("^ +| +$", "", tweets.text)
# Replace "ak parti" with "akp" so the party is counted as one token
tweets.text <- gsub("ak parti", "akp", tweets.text)
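# Illustration of the cleaning pipeline on an invented sample tweet:
#   "@kullanici Genel Kurul toplandı! https://t.co/abc123"
# comes out of the steps above as:
#   "genel kurul toplandı"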
# Create corpus
tweets.text.corpus <- Corpus(VectorSource(tweets.text))
# Clean up by removing stop words
#tweets.text.corpus <- tm_map(tweets.text.corpus, function(x) removeWords(x, stopwords()))
# Turkish stopwords ============================================================ IMPORTANT
# Save stop-words-turkish.txt (the list above) into your working directory
turkish <- read.table("stop-words-turkish.txt", sep = "\n", stringsAsFactors = FALSE)
turkish_stop <- unlist(turkish)
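# Quick sanity check that the stopword list came in as one word per element;
# the exact length depends on the file contents:
head(turkish_stop)
length(turkish_stop)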
# ============================================================================== IMPORTANT
# Create term-document matrix applying some transformations
# To be safe, re-clean the text here as well; turkish_stop is appended to the
# ad-hoc stopword vector so both lists are removed
tdm <- TermDocumentMatrix(
  tweets.text.corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c("bir", "gibi", "ama", "daha", "yok", "http", "ben",
                  "belki", "hiçbir", "sen", "var", "neden", "nasi",
                  "ile", "nasıl", "kadar", "kim", "için", "inci", "uncu",
                  turkish_stop),
    removeNumbers = TRUE,
    tolower = TRUE))
# Create document-term matrix with the same transformations
dtm <- DocumentTermMatrix(
  tweets.text.corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c("bir", "gibi", "ama", "daha", "yok", "http", "ben",
                  "belki", "hiçbir", "sen", "var", "neden", "nasi",
                  "ile", "nasıl", "kadar", "kim", "için", "inci", "uncu",
                  turkish_stop),
    removeNumbers = TRUE,
    tolower = TRUE))
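# A quick look at what made it into the matrices; findFreqTerms() lists the
# terms at or above a frequency threshold (10 here is an arbitrary choice):
dim(tdm)
findFreqTerms(tdm, lowfreq = 10)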
# Associations: terms whose occurrence across documents correlates with the
# query term at or above corlimit
findAssocs(dtm, "akp", corlimit = 0.15)
findAssocs(dtm, "bdp", corlimit = 0.15)
findAssocs(dtm, "chp", corlimit = 0.15)
findAssocs(dtm, "hdp", corlimit = 0.15)
findAssocs(dtm, "mhp", corlimit = 0.15)
# AKP correlation plot
toi <- "akp"      # term of interest
corlimit <- 0.15  # lower correlation bound
# findAssocs() returns a list with one named numeric vector per query term
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
akp_assoc <- data.frame(corr = assocs, terms = names(assocs))
# Fix the factor levels so the plot keeps the decreasing-correlation order
akp_assoc$terms <- factor(akp_assoc$terms, levels = akp_assoc$terms)
# Plot and save the image in png format
png("akp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(akp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# BDP correlation plot
toi <- "bdp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
bdp_assoc <- data.frame(corr = assocs, terms = names(assocs))
bdp_assoc$terms <- factor(bdp_assoc$terms, levels = bdp_assoc$terms)
png("bdp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(bdp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# CHP correlation plot
toi <- "chp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
chp_assoc <- data.frame(corr = assocs, terms = names(assocs))
chp_assoc$terms <- factor(chp_assoc$terms, levels = chp_assoc$terms)
png("chp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(chp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# HDP correlation plot
toi <- "hdp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
hdp_assoc <- data.frame(corr = assocs, terms = names(assocs))
hdp_assoc$terms <- factor(hdp_assoc$terms, levels = hdp_assoc$terms)
png("hdp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(hdp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
# MHP correlation plot
toi <- "mhp"
corlimit <- 0.15
assocs <- findAssocs(dtm, toi, corlimit)[[toi]]
mhp_assoc <- data.frame(corr = assocs, terms = names(assocs))
mhp_assoc$terms <- factor(mhp_assoc$terms, levels = mhp_assoc$terms)
png("mhp.png", width = 9, height = 9, units = "in", res = 500)
print(ggplot(mhp_assoc, aes(y = terms)) +
        geom_point(aes(x = corr, size = corr)) +
        scale_size(range = c(3, 15)) +
        ylab("") +
        xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
dev.off()
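# The five blocks above repeat the same steps; an equivalent loop (a sketch
# that would produce the same five PNGs) could replace them:
# for (toi in c("akp", "bdp", "chp", "hdp", "mhp")) {
#   a  <- findAssocs(dtm, toi, 0.15)[[toi]]
#   df <- data.frame(corr = a, terms = factor(names(a), levels = names(a)))
#   png(paste0(toi, ".png"), width = 9, height = 9, units = "in", res = 500)
#   print(ggplot(df, aes(y = terms)) +
#           geom_point(aes(x = corr, size = corr)) +
#           scale_size(range = c(3, 15)) +
#           ylab("") +
#           xlab(paste0("Correlation with the term \"", toi, "\"")))
#   dev.off()
# }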
# WORDCLOUD
# Convert the tdm to a plain matrix
m <- as.matrix(tdm)
# Get word counts in decreasing order
word_freqs <- sort(rowSums(m), decreasing = TRUE)
# Create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
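# Peek at the most frequent terms before plotting:
head(dm, 10)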
# Color palette (newer wesanderson releases rename this palette to "Zissou1")
la_cont <- wes_palette(name = "Zissou", type = "continuous")
# Plot and save the image in png format
png("tbmm.png", width = 9, height = 9, units = "in", res = 500)
wordcloud(dm$word, dm$freq, random.order = FALSE, min.freq = 2,
          scale = c(4, 0.5), max.words = 100, colors = la_cont)
dev.off()
# Save workspace
save.image(file = "tbmm.RData")