Last active
March 25, 2019 11:02
-
-
Save yrochat/655604c1092d53b4569c55922e0ee275 to your computer and use it in GitHub Desktop.
The code for article #20 on my blog (about FIFA)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rm(list=ls()) | |
library(readr) # importation | |
library(RTextTools) # classification de textes | |
library(textreuse) # plagiat | |
library(tm) # text mining | |
library(ggplot2) # visualisations | |
library(lsa) # latent semantic analysis | |
library(stringr) # manipulation de chaines de caractères | |
fifa <- read_csv2("fifa.csv") # le fichier à importer | |
fifa$label <- str_c(fifa$auteur, str_sub(fifa$annee, 3, 4), sep = "_") # on crée cette variable pour nommer les points du graphe | |
############# | |
## PLAGIAT ## | |
############# | |
fifa.corpus <- TextReuseCorpus(text = fifa$texte, tokenizer = tokenize_ngrams) | |
comparisons <- pairwise_compare(fifa.corpus, jaccard_similarity) | |
pairwise_candidates(comparisons)[order(pairwise_candidates(comparisons)$score, decreasing = TRUE),] | |
align_local(fifa$texte[18], fifa$texte[39]) | |
align_local(fifa$texte[30], fifa$texte[31]) | |
align_local(fifa$texte[7], fifa$texte[11]) | |
align_local(fifa$texte[11], fifa$texte[12]) | |
align_local(fifa$texte[32], fifa$texte[33]) | |
align_local(fifa$texte[7], fifa$texte[8]) | |
# "If the function reports that there were multiple optimal alignments, then it is likely that there is no strong match in the document." | |
# apparemment pas de plagiat dans ces textes | |
############## | |
## DISTANCE ## | |
############## | |
# tuto http://bodong.ch/blog/2013/03/11/analyze-text-similarity-in-r-latent-semantic-analysis-and-multidimentional-scaling/ | |
corpus <- Corpus(VectorSource(fifa$texte)) | |
corpus <- tm_map(corpus, content_transformer(tolower)) | |
corpus <- tm_map(corpus, content_transformer(function(x) removeWords(x, stopwords("french")))) | |
corpus <- tm_map(corpus, content_transformer(removePunctuation)) | |
corpus <- tm_map(corpus, content_transformer(removeNumbers)) | |
corpus <- tm_map(corpus, stemDocument, language = "french") | |
corpus # check corpus | |
############### | |
## EUCLIDIEN ## | |
############### | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))] | |
dist.mat <- dist(t(as.matrix(td.mat))) | |
dist.mat # check distance matrix | |
dist.mat.print <- dist(t(as.matrix(td.mat)), diag = TRUE, upper = TRUE) | |
attr(dist.mat.print, "Labels") <- fifa$label | |
write.csv(as.matrix(dist.mat.print), "dist_eucl.csv") | |
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) # min 14 dimensions | |
fifa$x <- fit$points[, 1] | |
fifa$y <- fit$points[, 2] | |
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .4, check_overlap = F, show.legend = FALSE) | |
ggsave("euclidien.png", width = 10) | |
############# | |
## BINAIRE ## | |
############# | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))] | |
dist.mat <- dist(t(as.matrix(td.mat)), method = "binary") | |
dist.mat # check distance matrix | |
dist.mat.print <- dist(t(as.matrix(td.mat)), diag = TRUE, upper = TRUE) | |
attr(dist.mat.print, "Labels") <- fifa$label | |
write.csv(as.matrix(dist.mat.print), "dist_bin.csv") | |
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) | |
fifa$x <- fit$points[, 1] | |
fifa$y <- fit$points[, 2] | |
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .004, check_overlap = T, show.legend = FALSE) | |
ggsave("binaire.png", width = 10) | |
######### | |
## LSA ## | |
######### | |
corpus <- Corpus(VectorSource(fifa$texte)) | |
corpus <- tm_map(corpus, content_transformer(tolower)) | |
corpus <- tm_map(corpus, content_transformer(removeNumbers)) | |
corpus # check corpus | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) | |
td.mat.lsa <- lw_bintf(td.mat) * gw_idf(td.mat) # weighting | |
lsaSpace <- lsa(td.mat.lsa) # create LSA space | |
dist.mat.lsa <- dist(t(as.textmatrix(lsaSpace))) # compute distance matrix | |
dist.mat.lsa # check distance matrix | |
dist.mat.print <- dist(t(as.matrix(dist.mat.lsa)), diag = TRUE, upper = TRUE) | |
attr(dist.mat.print, "Labels") <- fifa$label | |
write.csv(as.matrix(dist.mat.print), "dist_lsa.csv") | |
fit <- cmdscale(dist.mat.lsa, eig = TRUE, k = 2) # min 9 dimensions | |
fifa$x <- fit$points[, 1] | |
fifa$y <- fit$points[, 2] | |
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = 1, check_overlap = T, show.legend = FALSE) | |
ggsave("lsa.png", width = 10) | |
################# | |
## OTHER GAMES ## | |
################# | |
other <- read_csv2("other.csv") | |
other$label <- str_c(other$auteur, str_sub(other$annee, 3, 4), sep = "_") | |
fifa2 <- rbind(fifa,other) | |
corpus <- Corpus(VectorSource(fifa2$texte)) | |
corpus <- tm_map(corpus, content_transformer(tolower)) | |
corpus <- tm_map(corpus, content_transformer(function(x) removeWords(x, stopwords("french")))) | |
corpus <- tm_map(corpus, content_transformer(removePunctuation)) | |
corpus <- tm_map(corpus, content_transformer(removeNumbers)) | |
corpus <- tm_map(corpus, stemDocument, language = "french") | |
corpus # check corpus | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))] | |
dist.mat <- dist(t(as.matrix(td.mat))) | |
dist.mat # check distance matrix | |
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) # min 14 dimensions | |
fifa2$x <- fit$points[, 1] | |
fifa2$y <- fit$points[, 2] | |
ggplot(fifa2, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .4, check_overlap = F, show.legend = FALSE) | |
ggsave("lol.png", width = 10) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment