Skip to content

Instantly share code, notes, and snippets.

@debuos512
Last active June 7, 2021 11:44
Show Gist options
  • Save debuos512/6d65dd60faaae428e20273190c115257 to your computer and use it in GitHub Desktop.
Save debuos512/6d65dd60faaae428e20273190c115257 to your computer and use it in GitHub Desktop.
import java.util.concurrent.*
import java.util.concurrent.atomic.*
import groovyx.gpars.*
import groovy.json.*
import groovyx.gpars.GParsPool
// Extracts every <Abstract> element from the PubMed baseline XML files and
// writes each one to its own text file under extract/<file-stem>/.
// Files are processed in parallel on a pool of 10 threads.

// Shared across worker threads, so it must be atomic: a plain `o++` inside
// eachParallel is a read-modify-write race and yields lost/duplicated counts.
def processed = new AtomicInteger(0)

// Collect only the XML files up front so the progress total printed below
// reflects the work that will actually be done (the directory may also hold
// non-XML entries, which were previously counted in the total but skipped).
def files = []
new File('/data2/pubmed_xml_baseline/ftp.ncbi.nlm.nih.gov/pubmed/baseline/').eachFile { f ->
    if (f.getName() =~ 'xml$') {
        files << f
    }
}
def total = files.size()

GParsPool.withPool(10) {
    files.eachParallel { f ->
        def done = processed.incrementAndGet()
        println f.getName() + " $done/$total"

        // File name up to the first '.', used for both the output directory
        // and the prefix of each abstract file.
        def v = f.getName().tokenize('.')[0]
        def outDir = new File('extract/' + v)
        // mkdirs() (not mkdir()) so the parent 'extract' directory is created
        // on first use instead of failing silently and breaking the writes below.
        outDir.mkdirs()

        // we need to spin up a new parser each time, or it gets mad
        def parser = new XmlSlurper()
        // Allow the DOCTYPE declaration in the input, but never fetch the external DTD.
        parser.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
        parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false)
        def document = parser.parse(f)

        document.depthFirst().findAll { it.name() == 'Abstract' }.eachWithIndex { abs, i ->
            // An Abstract may contain several AbstractText children; join them one per line.
            def aText = abs['AbstractText'].collect { it.text() }.join('\n')
            // Write explicitly as UTF-8 rather than the platform default charset,
            // so non-ASCII characters in abstracts are preserved on every host.
            new File(outDir, "${v}_abstract_${i}.txt").setText(aText, 'UTF-8')
            // Per-file progress heartbeat every 5000 abstracts written.
            def written = i + 1
            if (written % 5000 == 0) { println written }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment