Last active
April 24, 2019 16:33
-
-
Save dginev/ac5c282fbb5aa3fbd99b67029c638835 to your computer and use it in GitHub Desktop.
Extracting arXiv category metadata from OAI_PMHv2.0 xml harvest
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//! Convert arXiv's OAI-harvested XML files into a lookup table of classification labels.
// Step 0. Prerequisite: download all needed arXiv metadata via OAI, e.g.
//```
// $ pip install git+http://github.com/bloomonkey/oai-harvest.git#egg=oaiharvest
// $ mkdir metadata/arxiv; cd metadata/arxiv
// $ oai-reg add arxiv http://export.arxiv.org/oai2?verb=Identify
// $ oai-harvest arxiv --until 2018-09-09
//```
// Endpoint documentation is available at: https://arxiv.org/help/oa
use jwalk::WalkDir; | |
use libxml::parser::Parser; | |
use libxml::xpath::Context; | |
use rayon::prelude::*; | |
use serde_json; | |
use std::collections::{HashMap, HashSet}; | |
use std::env; | |
use std::error::Error; | |
use std::fs::File; | |
use std::sync::{Arc, Mutex}; | |
use std::time::SystemTime; | |
fn main() -> Result<(), Box<Error>> { | |
let start = SystemTime::now(); | |
// Read input arguments | |
let mut input_args = env::args(); | |
let _ = input_args.next(); // skip process name | |
let metadata_path = match input_args.next() { | |
Some(path) => path, | |
None => "../../metadata/arxiv/".to_string(), | |
}; | |
let labels_filepath = match input_args.next() { | |
Some(path) => path, | |
None => "categories-arXMLiv-08-2018.json".to_string(), | |
}; | |
// Extract a dataset (JSON?) of relevant metadata for a given NLP task. | |
// here, arXiv subject categories | |
let catalog: HashMap<String, Vec<String>> = HashMap::new(); | |
let catalog_arc = Arc::new(Mutex::new(catalog)); | |
WalkDir::new(metadata_path) | |
.num_threads(rayon::current_num_threads()) | |
.sort(true) | |
.into_iter() | |
.filter_map(|each| { | |
if let Ok(entry) = each { | |
let file_name = entry.file_name.to_str().unwrap_or(""); | |
if file_name.ends_with(".xml") { | |
let path = entry.path().to_str().unwrap_or("").to_owned(); | |
if !path.is_empty() { | |
return Some(path); | |
} | |
} | |
} | |
// all other cases | |
None | |
}) | |
.enumerate() | |
.par_bridge() | |
.for_each(|each| { | |
let (index, path) = each; | |
if index % 10000 == 0 { | |
println!("at document {:?}", index); | |
} | |
let parser = Parser::default(); | |
let doc = parser.parse_file(&path).unwrap(); | |
let mut context = Context::new(&doc).unwrap(); | |
let id = context.findvalue("/*/*[local-name()='id']", None).unwrap(); | |
let category = context | |
.findvalue("/*/*[local-name()='categories']", None) | |
.unwrap(); | |
let mut categories: HashSet<String> = HashSet::new(); | |
for cat in category.split(' ') { | |
let dotparts: Vec<&str> = cat.split('.').collect(); | |
if dotparts.len() > 1 { | |
// also record the lead-in category | |
categories.insert(dotparts[0].to_lowercase().to_string()); | |
// and the entire category | |
categories.insert(cat.to_lowercase().to_string()); | |
} else { | |
// just a single category to record | |
categories.insert(cat.to_lowercase().to_string()); | |
} | |
} | |
let mut categories_vec: Vec<String> = categories.drain().collect(); | |
categories_vec.sort(); | |
let thread_arc = catalog_arc.clone(); | |
let mut catalog_lock = thread_arc.lock().unwrap(); | |
catalog_lock.insert(id, categories_vec); | |
}); | |
// serialize to json | |
let file = File::create(labels_filepath)?; | |
let catalog_lock = catalog_arc.lock().unwrap(); | |
serde_json::to_writer(file, &*catalog_lock)?; | |
let duration_sec = SystemTime::now().duration_since(start).unwrap().as_secs(); | |
println!("-- metadata packer took {:?} seconds.", duration_sec); | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment