Skip to content

Instantly share code, notes, and snippets.

@yrochat
Last active March 25, 2019 11:02
Show Gist options
  • Save yrochat/655604c1092d53b4569c55922e0ee275 to your computer and use it in GitHub Desktop.
Save yrochat/655604c1092d53b4569c55922e0ee275 to your computer and use it in GitHub Desktop.
The code for article #20 on my blog (about FIFA)
rm(list=ls())
library(readr) # importation
library(RTextTools) # classification de textes
library(textreuse) # plagiat
library(tm) # text mining
library(ggplot2) # visualisations
library(lsa) # latent semantic analysis
library(stringr) # manipulation de chaines de caractères
fifa <- read_csv2("fifa.csv") # le fichier à importer
fifa$label <- str_c(fifa$auteur, str_sub(fifa$annee, 3, 4), sep = "_") # on crée cette variable pour nommer les points du graphe
#############
## PLAGIAT ##
#############
fifa.corpus <- TextReuseCorpus(text = fifa$texte, tokenizer = tokenize_ngrams)
comparisons <- pairwise_compare(fifa.corpus, jaccard_similarity)
pairwise_candidates(comparisons)[order(pairwise_candidates(comparisons)$score, decreasing = TRUE),]
align_local(fifa$texte[18], fifa$texte[39])
align_local(fifa$texte[30], fifa$texte[31])
align_local(fifa$texte[7], fifa$texte[11])
align_local(fifa$texte[11], fifa$texte[12])
align_local(fifa$texte[32], fifa$texte[33])
align_local(fifa$texte[7], fifa$texte[8])
# "If the function reports that there were multiple optimal alignments, then it is likely that there is no strong match in the document."
# apparemment pas de plagiat dans ces textes
##############
## DISTANCE ##
##############
# tuto http://bodong.ch/blog/2013/03/11/analyze-text-similarity-in-r-latent-semantic-analysis-and-multidimentional-scaling/
corpus <- Corpus(VectorSource(fifa$texte))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, content_transformer(function(x) removeWords(x, stopwords("french"))))
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, stemDocument, language = "french")
corpus # check corpus
###############
## EUCLIDIEN ##
###############
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))]
dist.mat <- dist(t(as.matrix(td.mat)))
dist.mat # check distance matrix
dist.mat.print <- dist(t(as.matrix(td.mat)), diag = TRUE, upper = TRUE)
attr(dist.mat.print, "Labels") <- fifa$label
write.csv(as.matrix(dist.mat.print), "dist_eucl.csv")
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) # min 14 dimensions
fifa$x <- fit$points[, 1]
fifa$y <- fit$points[, 2]
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .4, check_overlap = F, show.legend = FALSE)
ggsave("euclidien.png", width = 10)
#############
## BINAIRE ##
#############
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))]
dist.mat <- dist(t(as.matrix(td.mat)), method = "binary")
dist.mat # check distance matrix
dist.mat.print <- dist(t(as.matrix(td.mat)), diag = TRUE, upper = TRUE)
attr(dist.mat.print, "Labels") <- fifa$label
write.csv(as.matrix(dist.mat.print), "dist_bin.csv")
fit <- cmdscale(dist.mat, eig = TRUE, k = 2)
fifa$x <- fit$points[, 1]
fifa$y <- fit$points[, 2]
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .004, check_overlap = T, show.legend = FALSE)
ggsave("binaire.png", width = 10)
#########
## LSA ##
#########
corpus <- Corpus(VectorSource(fifa$texte))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus # check corpus
td.mat <- as.matrix(TermDocumentMatrix(corpus))
td.mat.lsa <- lw_bintf(td.mat) * gw_idf(td.mat) # weighting
lsaSpace <- lsa(td.mat.lsa) # create LSA space
dist.mat.lsa <- dist(t(as.textmatrix(lsaSpace))) # compute distance matrix
dist.mat.lsa # check distance matrix
dist.mat.print <- dist(t(as.matrix(dist.mat.lsa)), diag = TRUE, upper = TRUE)
attr(dist.mat.print, "Labels") <- fifa$label
write.csv(as.matrix(dist.mat.print), "dist_lsa.csv")
fit <- cmdscale(dist.mat.lsa, eig = TRUE, k = 2) # min 9 dimensions
fifa$x <- fit$points[, 1]
fifa$y <- fit$points[, 2]
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = 1, check_overlap = T, show.legend = FALSE)
ggsave("lsa.png", width = 10)
#################
## OTHER GAMES ##
#################
other <- read_csv2("other.csv")
other$label <- str_c(other$auteur, str_sub(other$annee, 3, 4), sep = "_")
fifa2 <- rbind(fifa,other)
corpus <- Corpus(VectorSource(fifa2$texte))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, content_transformer(function(x) removeWords(x, stopwords("french"))))
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, stemDocument, language = "french")
corpus # check corpus
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))]
dist.mat <- dist(t(as.matrix(td.mat)))
dist.mat # check distance matrix
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) # min 14 dimensions
fifa2$x <- fit$points[, 1]
fifa2$y <- fit$points[, 2]
ggplot(fifa2, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .4, check_overlap = F, show.legend = FALSE)
ggsave("lol.png", width = 10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment