Created
October 16, 2023 23:16
-
-
Save NeutralKaon/c7486e71eff11f364952859d8d67356b to your computer and use it in GitHub Desktop.
Extract EndNote citations from a word document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(xml2)
library(magrittr)
library(RCurl)

# Clean-room, reverse-engineered (for compatibility and interoperability),
# crude-yet-effective extractor for the citations EndNote embeds in a Word
# document. This is useful if you wish to e.g. rewrite the Word document in
# LaTeX and import the same citations en bloc (via XML).
#
# EC 2009/24/EC states "The person having a right to use a copy of a computer
# program shall be entitled, without the authorisation of the rightholder, to
# observe, study or test the functioning of the program in order to determine
# the ideas and principles which underlie any element of the program if he does
# so while performing any of the acts of loading, displaying, running,
# transmitting or storing the program which he is entitled to do."
# Don't sue me!
#
# Input:  "document.xml" in the working directory (the word/document.xml member
#         of the .docx archive).
# Output: "extracted_refs.xml" in the working directory, containing every
#         <record> found under a <Cite> element, wrapped in <xml><records>.

# So, to extract references from a Word document sent to you:
# docx <- yourdocx
# unzip(docx, files = "word/document.xml", junkpaths = TRUE) # Or do it manually

doc <- read_xml("document.xml")
refs <- xml_find_all(doc, "//w:fldData")

# xml_text() is vectorised over a nodeset and always yields a character vector,
# including character(0) when nothing matched.
refs_text <- xml_text(refs)

if (length(refs_text) == 0L) {
  stop("No w:fldData elements found in document.xml; nothing to extract.",
       call. = FALSE)
}

# Strip the line breaks inside each base64 payload, then decode it.
decode_me <- vapply(
  refs_text,
  function(b64) base64Decode(gsub("[\r\n]", "", b64)),
  character(1),
  USE.NAMES = FALSE
)

# Identical payloads only need to be parsed once.
decode_me <- unique(decode_me)

# Parse one decoded payload and return its //Cite/record nodes serialised as
# strings. A payload that cannot be parsed as XML is skipped with a warning
# instead of aborting the whole extraction.
extract_records <- function(payload) {
  tryCatch(
    read_xml(payload) %>%
      xml_find_all("//Cite/record") %>%
      as.character(),
    error = function(e) {
      warning("Skipping field data that could not be parsed as XML: ",
              conditionMessage(e), call. = FALSE)
      character(0)
    }
  )
}

# Extract the embedded citation records from every decoded payload.
refs_xml <- unlist(lapply(decode_me, extract_records), use.names = FALSE)

# Wrap all records in a single document, re-parse it as a well-formedness
# check, and write it to disk.
refs_xmlf <- paste0(
  '<?xml version="1.0" encoding="UTF-8" ?><xml><records>',
  paste0(refs_xml, collapse = ""),
  "</records></xml>"
) %>%
  read_xml() %>%
  write_xml(file = "extracted_refs.xml")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment