Created
May 30, 2017 16:11
-
-
Save ettorerizza/e860249572212fdb6c83167994eaffc4 to your computer and use it in GitHub Desktop.
Script R pour parser les 26 000 XML/TEI du corpus européen JRC-Acquis et leur ajouter leurs descripteurs eurovoc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML) | |
library(dplyr) | |
library(stringr) | |
library(readr) | |
library(readxl) | |
library(tidyr) | |
#liste des fichiers XML du corpus JRC Acquis version anglaise (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-en.tgz) | |
liste <- | |
list.files( | |
path = "C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-en/en", | |
recursive = TRUE, | |
ignore.case = FALSE, | |
include.dirs = FALSE, | |
full.names = TRUE | |
) | |
#fonction pour parser les TEI | |
parseTei <- function(fichier, path, xmlelement) { | |
text_parsed <- tryCatch({ | |
doc <- xmlParse(fichier, trim = FALSE) | |
text <- unlist(xpathSApply(doc, path, xmlelement)) | |
text <- gsub("%quot%", '"', text) | |
}, | |
error = function(cond) { | |
message(paste("file does not seem to exist:", fichier)) | |
message("Here's the original error message:") | |
message(cond) | |
# Choose a return value in case of error | |
return(NA) | |
}, | |
warning = function(cond) { | |
message(paste("file caused a warning:", fichier)) | |
message("Here's the original warning message:") | |
message(cond) | |
return(NULL) | |
}, | |
finally = { | |
message(paste("Processed file:", fichier)) | |
}) | |
return(c(fichier, text_parsed)) | |
} | |
#récupération des titres, body and filename | |
body <- | |
lapply(liste, | |
parseTei, | |
path = '//body|/TEI.2/teiHeader/fileDesc/titleStmt/title[1]', | |
xmlelement = xmlValue) | |
#list to dataframe | |
library(plyr) | |
dfrm = ldply(body, rbind) | |
colnames(dfrm) <- c("file", "doc", "body") | |
#ajout d'une colonne file_id (oublié de l'extraire des XML...) | |
dfrm <- | |
dfrm %>% | |
mutate(file_id = str_extract(doc, "\\S+\\d+\\S+")) | |
#fichier texte contenant les codes eurovocs (http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/jrc-acquis-eurovoc-descriptors.txt) | |
jrc_eurovoc <- | |
read_delim( | |
"C:/Users/ettor/Desktop/Eurovoc/JRC Acqis Corpus/jrc-acquis-eurovoc-descriptors.txt", | |
"\t", | |
escape_double = FALSE, | |
col_names = c("file_id", "X2", "eurovocs"), | |
col_types = cols(X2 = col_skip()), | |
trim_ws = TRUE | |
) | |
jrc_eurovocs_tidy <- | |
jrc_eurovoc %>% | |
tidyr::separate_rows(eurovocs, sep = " ") | |
#fichier contenant le label des codes eurovoc en anglais (http://eurovoc.europa.eu/drupal/?q=fr/download/list_pt&cl=en) | |
listPt <- | |
read_excel("C:/Users/ettor/Desktop/Eurovoc/listPt EurovocEN.xls") | |
#jointure entre eurovoc_tidy et la liste des labels | |
jrc_eurovocs_tidy <- | |
jrc_eurovocs_tidy %>% | |
left_join(listPt, by = c("eurovocs" = "ID")) | |
View(jrc_eurovocs_tidy) | |
#concatenation des lables dans une seule cellule, pour plus de lisibilité | |
library(data.table) | |
final_concat <- | |
setDT(jrc_eurovocs_tidy)[, .(concat_eurovocs = paste(EN, collapse = " || ")), by = .(file_id)] | |
#on réunit le tout | |
final_merge <- | |
dfrm %>% | |
left_join(final_concat) %>% | |
na.omit() | |
data.table::fwrite(final_merge,"final_merge.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment