jsaraviadrago · August 6, 2019 19:01
diff --git a/Script_Text_mining_WC.R b/Script_Text_mining_WC.R


 library(tm)
 library(SnowballC)
 library(wordcloud)
 library(RColorBrewer)

 # Ask the user to pick the input file and read it, one element per line.
 data <- readLines(con = file.choose())

 # Wrap the lines in a corpus: each element of `data` becomes one document.
 mydata <- Corpus(x = VectorSource(x = data))

 # convert to lower case
 mydata <- tm_map(
   x = mydata,
   FUN = content_transformer(tolower)
 )

 # # remove non-word characters (what would be emojis)
 # mydata<-tm_map(mydata, content_transformer(gsub), pattern="\\W",replace=" ")
 # # remove URLs
 # removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
 # mydata <- tm_map(mydata, content_transformer(removeURL)
 # )

 # Delete every character that is neither alphabetic nor whitespace
 # (digits, punctuation, symbols). Letters and spacing are returned as-is.
 # `x` is a character vector; the result has the same length.
 removeNumPunct <- function(x) {
   gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
 }
 # Run the non-letter filter over the content of each document in the corpus.
 mydata <- tm_map(
   x = mydata,
   FUN = content_transformer(removeNumPunct)
 )

 # Fold the five acute-accented lowercase vowels to their plain counterparts;
 # every other character (including "ñ") is returned unchanged.
 changepunct <- function(x) {
   chartr(old = "áéíóú", new = "aeiou", x = x)
 }
 # Apply the accent folding to the content of every document in the corpus.
 mydata <- tm_map(mydata, content_transformer(changepunct))

 # remove stopwords
 # The corpus has just lost its accents, while stopwords("spanish") still
 # carries accented entries (e.g. "más", "también"). Matching against the raw
 # list would let those words through in their unaccented form, so the list is
 # folded with the same changepunct() before it is used.

 # Example of building a custom list (English):
 # myStopwords <- c(setdiff(stopwords('english'), c("r", "big")),"use", "see", "used", "via", "amp")
 # mydata <- tm_map(mydata, removeWords, myStopwords)

 # "mas" is taken out of the folded list so that it stays in the corpus.
 # A single removal pass is used: removing the full list first would make
 # this exclusion ineffective.
 myStopwords <- setdiff(changepunct(stopwords("spanish")), c("mas"))
 mydata <- tm_map(mydata, removeWords, myStopwords)

 # remove extra whitespace
 mydata <- tm_map(mydata, stripWhitespace)

 # Stemming in spanish

 # mydata <- tm_map(mydata, PlainTextDocument)  # needs to come before stemming
 # mydata <- tm_map(mydata, stemDocument,
 #                  "spanish")



 # Build a term-document matrix and derive a frequency table from it.

 dtm <- TermDocumentMatrix(x = mydata)

 # Dense copy of the matrix; each row is summed to get a total per term.
 m <- as.matrix(x = dtm)
 v <- sort(x = rowSums(x = m), decreasing = TRUE)

 # One row per term, ordered from most to least frequent.
 d <- data.frame(word = names(v), freq = v)

 # Show the ten most frequent terms.
 head(x = d, n = 10)

 # Generate the wordcloud
 # At most 300 terms with frequency >= 1; random.order = FALSE requests
 # plotting in decreasing frequency and rot.per is the share of rotated words.

 wordcloud(words = d$word, freq = d$freq, min.freq = 1,
           max.words = 300, random.order = FALSE, rot.per = 0.35,
           colors = brewer.pal(8, "Dark2"))

 # Plot word frequencies
 # head() returns only the rows that exist, whereas d[1:10, ] pads the result
 # with NA rows (and "NA" bar labels) when the corpus has fewer than 10 terms.

 topWords <- head(d, 10)
 barplot(topWords$freq, las = 2, names.arg = topWords$word,
         col = "lightblue", main = "Palabras mas frecuentes",
         ylab = "Frecuencia de las palabras")

 # Explore frequent terms and associations

 # Terms that occur at least 6 times.
 findFreqTerms(x = dtm, lowfreq = 6)

 # findAssocs() lists the terms whose correlation with the given term
 # reaches corlimit (0.5 here).
 # NOTE(review): the term "data" looks carried over from an English-language
 # tutorial; confirm it occurs in this corpus or replace it with a term
 # reported by findFreqTerms() above.

 findAssocs(x = dtm, terms = "data", corlimit = 0.5)


	library(tm)
	library(SnowballC)
	library(wordcloud)
	library(RColorBrewer)

	# Ask the user to pick the input file and read it, one element per line.
	data <- readLines(con = file.choose())

	# Wrap the lines in a corpus: each element of `data` becomes one document.
	mydata <- Corpus(x = VectorSource(x = data))

	# convert to lower case
	mydata <- tm_map(
	  x = mydata,
	  FUN = content_transformer(tolower)
	)

	# # remove non-word characters (what would be emojis)
	# mydata<-tm_map(mydata, content_transformer(gsub), pattern="\\W",replace=" ")
	# # remove URLs
	# removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
	# mydata <- tm_map(mydata, content_transformer(removeURL)
	# )

	# Delete every character that is neither alphabetic nor whitespace
	# (digits, punctuation, symbols). Letters and spacing are returned as-is.
	# `x` is a character vector; the result has the same length.
	removeNumPunct <- function(x) {
	  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
	}
	# Run the non-letter filter over the content of each document in the corpus.
	mydata <- tm_map(
	  x = mydata,
	  FUN = content_transformer(removeNumPunct)
	)

	# Fold the five acute-accented lowercase vowels to their plain counterparts;
	# every other character (including "ñ") is returned unchanged.
	changepunct <- function(x) {
	  chartr(old = "áéíóú", new = "aeiou", x = x)
	}
	# Apply the accent folding to the content of every document in the corpus.
	mydata <- tm_map(mydata, content_transformer(changepunct))

	# remove stopwords
	# The corpus has just lost its accents, while stopwords("spanish") still
	# carries accented entries (e.g. "más", "también"). Matching against the raw
	# list would let those words through in their unaccented form, so the list is
	# folded with the same changepunct() before it is used.

	# Example of building a custom list (English):
	# myStopwords <- c(setdiff(stopwords('english'), c("r", "big")),"use", "see", "used", "via", "amp")
	# mydata <- tm_map(mydata, removeWords, myStopwords)

	# "mas" is taken out of the folded list so that it stays in the corpus.
	# A single removal pass is used: removing the full list first would make
	# this exclusion ineffective.
	myStopwords <- setdiff(changepunct(stopwords("spanish")), c("mas"))
	mydata <- tm_map(mydata, removeWords, myStopwords)

	# remove extra whitespace
	mydata <- tm_map(mydata, stripWhitespace)

	# Stemming in spanish

	# mydata <- tm_map(mydata, PlainTextDocument) # needs to come before stemming
	# mydata <- tm_map(mydata, stemDocument,
	# "spanish")



	# Build a term-document matrix and derive a frequency table from it.

	dtm <- TermDocumentMatrix(x = mydata)

	# Dense copy of the matrix; each row is summed to get a total per term.
	m <- as.matrix(x = dtm)
	v <- sort(x = rowSums(x = m), decreasing = TRUE)

	# One row per term, ordered from most to least frequent.
	d <- data.frame(word = names(v), freq = v)

	# Show the ten most frequent terms.
	head(x = d, n = 10)

	# Generate the wordcloud
	# At most 300 terms with frequency >= 1; random.order = FALSE requests
	# plotting in decreasing frequency and rot.per is the share of rotated words.

	wordcloud(words = d$word, freq = d$freq, min.freq = 1,
	          max.words = 300, random.order = FALSE, rot.per = 0.35,
	          colors = brewer.pal(8, "Dark2"))

	# Plot word frequencies
	# head() returns only the rows that exist, whereas d[1:10, ] pads the result
	# with NA rows (and "NA" bar labels) when the corpus has fewer than 10 terms.

	topWords <- head(d, 10)
	barplot(topWords$freq, las = 2, names.arg = topWords$word,
	        col = "lightblue", main = "Palabras mas frecuentes",
	        ylab = "Frecuencia de las palabras")

	# Explore frequent terms and associations

	# Terms that occur at least 6 times.
	findFreqTerms(x = dtm, lowfreq = 6)

	# findAssocs() lists the terms whose correlation with the given term
	# reaches corlimit (0.5 here).
	# NOTE(review): the term "data" looks carried over from an English-language
	# tutorial; confirm it occurs in this corpus or replace it with a term
	# reported by findFreqTerms() above.

	findAssocs(x = dtm, terms = "data", corlimit = 0.5)