Skip to content

Instantly share code, notes, and snippets.

@diamonaj
Created January 30, 2023 23:34
Show Gist options
  • Save diamonaj/dcbf093b4d4a2ae921f83ab8a2bea134 to your computer and use it in GitHub Desktop.
Save diamonaj/dcbf093b4d4a2ae921f83ab8a2bea134 to your computer and use it in GitHub Desktop.
# Load the data (Data1.csv). This assumes you have downloaded it correctly,
# as discussed here: https://piazza.com/class/l7oq25mqbrz1nd/post/110
# You may need to change the file location in quotes below to match where
# your copy of the file is saved.
apple <- read.csv(
  "~/Documents/Data1.csv",
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
# Show the structure of what was loaded (columns and their types).
str(apple)
library(tm)

# Build a corpus from the `text` column of `apple`.
# Without `sub`, iconv() returns NA for every element containing a byte it
# cannot convert, which silently discards that whole document. `sub = ""`
# drops only the offending bytes and keeps the rest of the text.
corpus <- iconv(apple$text, "ASCII", "UTF-8", sub = "")
corpus <- Corpus(VectorSource(corpus))

# Show at most the first `n` documents of corpus `x`. The index is clamped to
# the corpus length so the preview never indexes past the last document.
inspect_head <- function(x, n) {
  inspect(x[seq_len(min(n, length(x)))])
}

inspect_head(corpus, 15)

# tolower() is a base function rather than a tm transformation, so it is
# wrapped in content_transformer() before being handed to tm_map().
corpus <- tm_map(corpus, content_transformer(tolower))
inspect_head(corpus, 15)
corpus <- tm_map(corpus, removePunctuation)
inspect_head(corpus, 15)
corpus <- tm_map(corpus, removeNumbers)
inspect_head(corpus, 15)
cleanset <- tm_map(corpus, removeWords, stopwords("english"))
inspect_head(cleanset, 15)

# Remove URL remnants. Punctuation has already been stripped at this point,
# so a link is matched as "http" followed by a run of alphanumeric characters.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
inspect_head(cleanset, 15)

# Drop the terms "aapl" and "apple".
cleanset <- tm_map(cleanset, removeWords, c("aapl", "apple"))
# Replace "stocks" with "stock"; gsub() is wrapped for the same reason as
# tolower() above, and the document content is passed as gsub()'s `x`.
cleanset <- tm_map(
  cleanset,
  content_transformer(gsub),
  pattern = "stocks",
  replacement = "stock"
)
cleanset <- tm_map(cleanset, stemDocument)
cleanset <- tm_map(cleanset, stripWhitespace)
inspect_head(cleanset, 7)
# Build the term-document matrix for the cleaned corpus and convert it to a
# plain matrix so base functions such as rowSums() can be applied to it.
tdm <- TermDocumentMatrix(cleanset)
tdm <- as.matrix(tdm)
# Preview the top-left corner (up to 10 rows by 20 columns). The indices are
# clamped to the matrix dimensions so a small matrix does not raise
# "subscript out of bounds"; drop = FALSE keeps the result a matrix.
tdm[
  seq_len(min(10, nrow(tdm))),
  seq_len(min(20, ncol(tdm))),
  drop = FALSE
]
# Row totals of the matrix, keeping only the rows whose total is at least 25.
w <- rowSums(tdm)
w <- w[w >= 25]
# las = 2 draws the axis labels perpendicular to the axis.
barplot(w, las = 2, col = rainbow(50))
library(wordcloud)
# Row totals of `tdm`, largest first; the names of `w` supply the words.
w <- sort(rowSums(tdm), decreasing = TRUE)
# Fix the random seed so repeated runs produce the same result.
set.seed(222)
wordcloud(
  words = names(w),
  freq = w,
  max.words = 150,
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  random.order = FALSE,
  min.freq = 5,
  colors = brewer.pal(8, "Dark2"),
  scale = c(5, 0.3),
  rot.per = 0.7
)
# Install syuzhet only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("syuzhet", quietly = TRUE)) {
  install.packages("syuzhet")
}
library(syuzhet)
# You may need to change the file location in quotes below to match where
# your copy of the file is saved.
apple <- read.csv(
  "~/Documents/Data1.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
# sub = "byte" replaces each non-convertible byte with its "<xx>" hex form
# instead of turning the whole element into NA.
# NOTE: only `texts` is converted here; this script defines no `raw_data`
# object, so no conversion of a `raw_data` column is attempted.
texts <- iconv(apple$text, "ASCII", "UTF-8", sub = "byte")
# Score the texts with syuzhet's NRC sentiment function.
s <- get_nrc_sentiment(texts)
head(s)
# Plot the column totals of the scores.
barplot(
  colSums(s),
  las = 2,
  col = rainbow(10),
  ylab = "Count",
  main = "Sentiment Scores Tweets"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment