-
-
Save juananpe/37b4232ea805be9e54465b4a76b8ecce to your computer and use it in GitHub Desktop.
The code for article #20 on my blog (about FIFA)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rm(list=ls()) | |
library(readr) # importation | |
library(RTextTools) # classification de textes | |
library(textreuse) # plagiat | |
library(tm) # text mining | |
library(ggplot2) # visualisations | |
library(lsa) # latent semantic analysis | |
library(stringr) # manipulation de chaines de caractères | |
fifa <- read_csv2("fifa.csv") # le fichier à importer | |
fifa$label <- str_c(fifa$auteur, str_sub(fifa$annee, 3, 4), sep = "_") # on crée cette variable pour nommer les points du graphe | |
############# | |
## PLAGIAT ## | |
############# | |
fifa.corpus <- TextReuseCorpus(text = fifa$texte, tokenizer = tokenize_ngrams) | |
comparisons <- pairwise_compare(fifa.corpus, jaccard_similarity) | |
pairwise_candidates(comparisons)[order(pairwise_candidates(comparisons)$score, decreasing = TRUE),] | |
align_local(fifa$texte[18], fifa$texte[39]) | |
align_local(fifa$texte[30], fifa$texte[31]) | |
align_local(fifa$texte[7], fifa$texte[11]) | |
align_local(fifa$texte[11], fifa$texte[12]) | |
align_local(fifa$texte[32], fifa$texte[33]) | |
align_local(fifa$texte[7], fifa$texte[8]) | |
# "If the function reports that there were multiple optimal alignments, then it is likely that there is no strong match in the document." | |
# apparemment pas de plagiat dans ces textes | |
############## | |
## DISTANCE ## | |
############## | |
# tuto http://bodong.ch/blog/2013/03/11/analyze-text-similarity-in-r-latent-semantic-analysis-and-multidimentional-scaling/ | |
corpus <- Corpus(VectorSource(fifa$texte)) | |
corpus <- tm_map(corpus, content_transformer(tolower)) | |
corpus <- tm_map(corpus, content_transformer(function(x) removeWords(x, stopwords("french")))) | |
corpus <- tm_map(corpus, content_transformer(removePunctuation)) | |
corpus <- tm_map(corpus, content_transformer(removeNumbers)) | |
corpus <- tm_map(corpus, stemDocument, language = "french") | |
corpus # check corpus | |
############### | |
## EUCLIDIEN ## | |
############### | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))] | |
dist.mat <- dist(t(as.matrix(td.mat))) | |
dist.mat # check distance matrix | |
dist.mat.print <- dist(t(as.matrix(td.mat)), diag = TRUE, upper = TRUE) | |
attr(dist.mat.print, "Labels") <- fifa$label | |
write.csv(as.matrix(dist.mat.print), "dist_eucl.csv") | |
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) # min 14 dimensions | |
fifa$x <- fit$points[, 1] | |
fifa$y <- fit$points[, 2] | |
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .4, check_overlap = F, show.legend = FALSE) | |
ggsave("euclidien.png", width = 10) | |
############# | |
## BINAIRE ## | |
############# | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))] | |
dist.mat <- dist(t(as.matrix(td.mat)), method = "binary") | |
dist.mat # check distance matrix | |
dist.mat.print <- dist(t(as.matrix(td.mat)), diag = TRUE, upper = TRUE) | |
attr(dist.mat.print, "Labels") <- fifa$label | |
write.csv(as.matrix(dist.mat.print), "dist_bin.csv") | |
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) | |
fifa$x <- fit$points[, 1] | |
fifa$y <- fit$points[, 2] | |
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .004, check_overlap = T, show.legend = FALSE) | |
ggsave("binaire.png", width = 10) | |
######### | |
## LSA ## | |
######### | |
corpus <- Corpus(VectorSource(fifa$texte)) | |
corpus <- tm_map(corpus, content_transformer(tolower)) | |
corpus <- tm_map(corpus, content_transformer(removeNumbers)) | |
corpus # check corpus | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) | |
td.mat.lsa <- lw_bintf(td.mat) * gw_idf(td.mat) # weighting | |
lsaSpace <- lsa(td.mat.lsa) # create LSA space | |
dist.mat.lsa <- dist(t(as.textmatrix(lsaSpace))) # compute distance matrix | |
dist.mat.lsa # check distance matrix | |
dist.mat.print <- dist(t(as.matrix(dist.mat.lsa)), diag = TRUE, upper = TRUE) | |
attr(dist.mat.print, "Labels") <- fifa$label | |
write.csv(as.matrix(dist.mat.print), "dist_lsa.csv") | |
fit <- cmdscale(dist.mat.lsa, eig = TRUE, k = 2) # min 9 dimensions | |
fifa$x <- fit$points[, 1] | |
fifa$y <- fit$points[, 2] | |
ggplot(fifa, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = 1, check_overlap = T, show.legend = FALSE) | |
ggsave("lsa.png", width = 10) | |
################# | |
## OTHER GAMES ## | |
################# | |
other <- read_csv2("other.csv") | |
other$label <- str_c(other$auteur, str_sub(other$annee, 3, 4), sep = "_") | |
fifa2 <- rbind(fifa,other) | |
corpus <- Corpus(VectorSource(fifa2$texte)) | |
corpus <- tm_map(corpus, content_transformer(tolower)) | |
corpus <- tm_map(corpus, content_transformer(function(x) removeWords(x, stopwords("french")))) | |
corpus <- tm_map(corpus, content_transformer(removePunctuation)) | |
corpus <- tm_map(corpus, content_transformer(removeNumbers)) | |
corpus <- tm_map(corpus, stemDocument, language = "french") | |
corpus # check corpus | |
td.mat <- as.matrix(TermDocumentMatrix(corpus)) # rownames(td.mat)[which.max(nchar(rownames(td.mat)))] | |
dist.mat <- dist(t(as.matrix(td.mat))) | |
dist.mat # check distance matrix | |
fit <- cmdscale(dist.mat, eig = TRUE, k = 2) # min 14 dimensions | |
fifa2$x <- fit$points[, 1] | |
fifa2$y <- fit$points[, 2] | |
ggplot(fifa2, aes(x, y, color = Journal)) + geom_point(size = 1) + geom_text(aes(label = label), size = 3, vjust = 0, nudge_y = .4, check_overlap = F, show.legend = FALSE) | |
ggsave("lol.png", width = 10) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment