Skip to content

Instantly share code, notes, and snippets.

@adjam
Created June 19, 2018 19:45
Show Gist options
  • Save adjam/d443f08ecafda412e224eda0ec467be7 to your computer and use it in GitHub Desktop.
Save adjam/d443f08ecafda412e224eda0ec467be7 to your computer and use it in GitHub Desktop.
Groovy script to take a batch of MARC files and split them into smaller files with a discrete size.
#!/usr/bin/env groovy
@Grapes(
@Grab(group='org.marc4j', module='marc4j', version='2.8.3')
)
import org.marc4j.MarcStreamReader
import org.marc4j.MarcXmlReader
import org.marc4j.MarcXmlWriter
import org.marc4j.marc.Record
import org.marc4j.converter.impl.AnselToUnicode
// not everything in here should be taken for granted
/**
 * Writes MARC records to a sequence of numbered output files, rolling over to
 * a new file every {@code splitCount} records.
 *
 * Output files are named by formatting {@code fileNamePat} with a 1-based file
 * number, and are opened lazily the first time a record needs to be written.
 * Callers should invoke {@link #close()} once after the last record so that
 * the final, partially filled file is finished as well.
 */
class SplitWriter {
    /** Number of records written to one file before rolling over to the next. */
    Long splitCount = 20000
    /** Number of output files opened so far; also the number of the newest file. */
    Integer fileCount = 0
    /** Total number of records written across all output files. */
    Long recordCount = 0
    /** Stream behind the currently open output file, or null when none is open. */
    OutputStream currentStream
    /** Writer wrapping {@code currentStream}, or null when no file is open. */
    def currentWriter = null
    /** The currently open output file, or null when no file is open. */
    def currentFile = null
    /** {@code String.format} pattern for output paths; receives the file number. */
    def fileNamePat = "output/marc-out-%d.xml"
    /** Writer implementation; must have a public constructor taking an OutputStream. */
    Class<?> writerClass = MarcXmlWriter.class

    /**
     * Writes one record to the current output file, opening a file first if
     * necessary, and closes that file once the running record count reaches a
     * multiple of {@code splitCount}.
     *
     * @param record the record to write
     */
    def write(Record record) {
        getCurrentWriter().write(record)
        if (++recordCount % splitCount == 0) {
            close()
        }
    }

    /**
     * Closes the current writer and its stream, if any, and resets the state
     * so that the next write opens a fresh file.
     *
     * Safe to call when nothing is open (before the first write, or right
     * after a rollover). The stream is still closed and the state still reset
     * if closing the writer throws.
     */
    def close() {
        try {
            if (currentWriter != null) {
                currentWriter.close()
            }
        } finally {
            try {
                if (currentStream != null) {
                    currentStream.flush()
                    currentStream.close()
                }
            } finally {
                currentStream = null
                currentWriter = null
                currentFile = null
            }
        }
    }

    /**
     * Returns the writer for the current output file, creating the next
     * numbered file (and its parent directory, if missing) when none is open.
     *
     * Note that because this is the getter for the {@code currentWriter}
     * property, reading {@code someSplitWriter.currentWriter} from outside
     * this class also opens a new file when none is open.
     *
     * @return the open writer, never null
     */
    def getCurrentWriter() {
        if (currentWriter == null) {
            currentFile = new File(String.format(fileNamePat, ++fileCount))
            // parentFile is null when the pattern has no directory component.
            def parentDir = currentFile.parentFile
            if (parentDir != null && !parentDir.directory) {
                parentDir.mkdirs()
            }
            currentStream = currentFile.newOutputStream()
            // Instantiated reflectively so writerClass can be swapped out.
            def constructor = writerClass.getConstructor(OutputStream)
            currentWriter = constructor.newInstance(currentStream)
            currentWriter.setConverter(new AnselToUnicode())
        }
        currentWriter
    }
}
// Script entry point: read every MARC file named on the command line and feed
// its records through a single SplitWriter, so output files are split by
// record count regardless of input file boundaries.
def writer = new SplitWriter()
try {
    args.each { filename ->
        new File(filename).withInputStream { input ->
            // Files ending in .xml are read as MARCXML; everything else as binary MARC.
            def reader = filename.endsWith('.xml') ? new MarcXmlReader(input) : new MarcStreamReader(input)
            try {
                while (reader.hasNext()) {
                    Record rec = reader.next()
                    // Set the leader's character coding scheme to 'a'
                    // (presumably marking the record as Unicode -- confirm against the MARC 21 leader spec).
                    rec.leader.charCodingScheme = 'a'.charAt(0)
                    writer.write(rec)
                }
            } catch (org.marc4j.MarcException mx) {
                // A parse failure abandons the rest of this file only; later files are still processed.
                println("Unable to parse ${filename}")
                mx.printStackTrace(System.out)
            }
        }
    }
} finally {
    // Finish the last, partially filled output file. Test currentStream rather
    // than currentWriter: reading writer.currentWriter from here goes through
    // getCurrentWriter(), which would open a new, empty output file.
    if (writer.currentStream != null) {
        writer.close()
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment