Created
June 19, 2018 19:45
-
-
Save adjam/d443f08ecafda412e224eda0ec467be7 to your computer and use it in GitHub Desktop.
Groovy script that takes a batch of MARC files and splits them into smaller files, each holding a fixed number of records.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env groovy | |
@Grapes( | |
@Grab(group='org.marc4j', module='marc4j', version='2.8.3') | |
) | |
import org.marc4j.MarcStreamReader | |
import org.marc4j.MarcXmlReader | |
import org.marc4j.MarcXmlWriter | |
import org.marc4j.marc.Record | |
import org.marc4j.converter.impl.AnselToUnicode | |
// not everything in here should be taken for granted | |
/**
 * Writes MARC records to a sequence of numbered output files, rolling over
 * to a new file after every {@code splitCount} records.
 *
 * Output files are created lazily: nothing is opened until the first record
 * is written (or {@link #getCurrentWriter()} is called). Callers should call
 * {@link #close()} once all records have been written so that the last,
 * partially filled file is finalised.
 */
class SplitWriter {
    /** Number of records written to one output file before rolling over. */
    Long splitCount = 20000
    /** Number of output files opened so far; also the index used in the next file name. */
    Integer fileCount = 0
    /** Total number of records written across all output files. */
    Long recordCount = 0
    /** Stream backing the currently open output file, or null when none is open. */
    OutputStream currentStream
    /** Writer wrapping {@code currentStream}, or null when no file is open. */
    def currentWriter = null
    /** The currently open output file, or null when none is open. */
    def currentFile = null
    /** {@code String.format} pattern for output file names; receives the file index. */
    def fileNamePat = "output/marc-out-%d.xml"
    /** Writer implementation; must expose a public constructor taking an OutputStream. */
    Class<?> writerClass = MarcXmlWriter.class

    /**
     * Writes one record to the current output file, opening a file first if
     * needed, and closes that file once it holds {@code splitCount} records.
     *
     * @param record the MARC record to write
     */
    def write(Record record) {
        getCurrentWriter().write(record)
        if (++recordCount % splitCount == 0) {
            close()
        }
    }

    /**
     * Closes the current writer and stream, if any, and resets the state so
     * the next write opens a fresh file. Safe to call when nothing is open
     * and safe to call repeatedly.
     */
    def close() {
        try {
            if (currentWriter != null) {
                currentWriter.close()
            }
        } finally {
            try {
                // Still release the stream if closing the writer failed.
                if (currentStream != null) {
                    currentStream.flush()
                    currentStream.close()
                }
            } finally {
                // Always reset so a failed close cannot leave stale handles behind.
                currentStream = null
                currentWriter = null
                currentFile = null
            }
        }
    }

    /**
     * Returns the writer for the current output file, creating the next
     * numbered file (and its parent directory, if missing) when none is open.
     *
     * Note: because this is a property getter, reading {@code currentWriter}
     * from outside this class also triggers the lazy file creation.
     *
     * @return the active writer, never null
     */
    def getCurrentWriter() {
        if (currentWriter == null) {
            currentFile = new File(String.format(fileNamePat, ++fileCount))
            // parentFile is null for a bare file name, so resolve it against the
            // absolute path and guard against null before creating directories.
            File parentDir = currentFile.absoluteFile.parentFile
            if (parentDir != null && !parentDir.directory) {
                parentDir.mkdirs()
            }
            currentStream = currentFile.newOutputStream()
            def constructor = writerClass.getConstructor(OutputStream)
            currentWriter = constructor.newInstance(currentStream)
            currentWriter.setConverter(new AnselToUnicode())
        }
        currentWriter
    }
}
// Read every MARC file named on the command line (MARCXML when the name ends
// in '.xml', binary MARC otherwise) and feed its records to a SplitWriter.
def writer = new SplitWriter()
try {
    args.each { filename ->
        new File(filename).withInputStream { input ->
            def reader = filename.endsWith('.xml') ? new MarcXmlReader(input) : new MarcStreamReader(input)
            try {
                while (reader.hasNext()) {
                    Record rec = reader.next()
                    // Set the leader's character coding scheme to 'a' before writing.
                    // NOTE(review): presumably marks the record as Unicode to match the
                    // ANSEL-to-Unicode converter installed on the writer -- confirm.
                    rec.leader.charCodingScheme = 'a'.charAt(0)
                    writer.write(rec)
                }
            } catch (org.marc4j.MarcException mx) {
                // A parse failure abandons the rest of this file; later files are still processed.
                println("Unable to parse ${filename}")
                mx.printStackTrace(System.out)
            }
        }
    }
} finally {
    // Finalise the last, partially filled output file; without this the writer
    // is never closed and the final file is left incomplete.
    // Check currentStream rather than currentWriter: reading currentWriter goes
    // through the lazy getter and would create a new, empty output file.
    if (writer.currentStream != null) {
        writer.close()
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment