Created: December 1, 2013 16:54
A Groovy script that extracts metadata from files using Apache Tika. It works recursively on a file hierarchy and writes all extracted metadata into a single XML file.
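At its core the script hands every file to Tika's AutoDetectParser and reads the populated Metadata object back; the directory traversal, MD5 hashing, and XML output are built around that one call. A minimal standalone sketch of just that call (illustrative only, not part of the gist, using the same Tika 1.4 dependency) could look like this:

// Standalone sketch: print all metadata Tika finds for a single file given on the command line.
@Grab('org.apache.tika:tika-parsers:1.4')
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser
import org.apache.tika.sax.BodyContentHandler

def metadata = new Metadata()
new File(args[0]).withInputStream { stream ->
    // AutoDetectParser sniffs the content type, delegates to the matching parser,
    // and fills the Metadata object as a side effect
    new AutoDetectParser().parse(stream, new BodyContentHandler(), metadata)
}
metadata.names().each { name -> println "$name = ${metadata.get(name)}" }

The full script below wraps this call in a recursive file finder, adds an MD5 checksum per file, and serialises the selected fields with StreamingMarkupBuilder.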
/**
 * Copyright 2013 Kai Sternad
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import groovy.xml.StreamingMarkupBuilder
import org.apache.tika.sax.BodyContentHandler
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.Parser
import org.apache.tika.metadata.TikaCoreProperties
import static org.apache.tika.metadata.TikaCoreProperties.*
import org.apache.commons.io.FilenameUtils
import groovy.util.logging.Slf4j
import java.security.MessageDigest
/**
 * This script extracts some of the Dublin Core metadata fields (and some file metadata) from files and writes them to an XML structure on disk.
 * It recursively traverses the file system, starting from the provided directory.<br/>
 * Any file having a suffix defined in <code>ALLOWED_FILES</code> will be processed.<br/>
 * The metadata fields to be extracted are defined in <code>METADATA_FIELDS</code>.<br/>
 *
 * This script depends on a working installation of Groovy / Grape and must be invoked from the command line with:
 * <pre>groovy TikaService.groovy &lt;root directory&gt;</pre>
 *
 * A file named "metadata.xml" is written into the directory the script is executed in.<br/>
 * It has the following structure:
 * <pre>
 * {@code
 * <?xml version="1.0" encoding="UTF-8"?>
 * <root xmlns:dc="http://purl.org/dc/elements/1.1/"
 *       xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
 *       xmlns:dcterms="http://purl.org/dc/terms/">
 *   <parsedFile>
 *     <fileMetadata name="helloworld.pdf">
 *       <Content-Type>application/pdf</Content-Type>
 *       <dc:title>Hello, World</dc:title>
 *       <dcterms:created>2009-06-25T12:42:58Z</dcterms:created>
 *       <dcterms:modified>2011-06-25T12:43:16Z</dcterms:modified>
 *       <file-md5>a8158833d3b1d341e705a4c268a7264c</file-md5>
 *       ...
 *     </fileMetadata>
 *   </parsedFile>
 * </root>
 * }
 * </pre>
 * @author Kai Sternad
 * @date 2013/10/20
 */
@Grab(group='org.apache.tika', module='tika-parsers', version='1.4')
@Grab(group='commons-io', module='commons-io', version='2.4')
@Grab(group='ch.qos.logback', module='logback-classic', version='1.0.13')
@Slf4j
class TikaService {

    public static void main(def args) {
        if (args.length != 1) {
            println "please supply the root directory for the metadata extraction"
            System.exit(-1)
        }
        def root = new File(args[0])
        if (!root.isDirectory()) {
            println "${root} is not a directory"
            System.exit(-1)
        }
        List<File> files = new RecursiveFileFinder().findSuitableFiles(root)
        def mde = new MetadataExtractor()
        File metadataFile = new File("metadata.xml")
        new TikaService().metaToXml(files, metadataFile, mde)
    }
    public void metaToXml(List files, File outfile, MetadataExtractor mde) {
        log.info("number of allowed files: " + files.size())
        def builder = new StreamingMarkupBuilder()
        // withWriter ensures the writer is flushed and closed once the XML has been written
        outfile.withWriter('utf-8') { writer ->
            writer << builder.bind {
                mkp.xmlDeclaration(version: "1.0", encoding: "utf-8")
                mkp.declareNamespace("dc": "http://purl.org/dc/elements/1.1/")
                mkp.declareNamespace("dcterms": "http://purl.org/dc/terms/")
                mkp.declareNamespace("meta": "urn:oasis:names:tc:opendocument:xmlns:meta:1.0")
                root() {
                    files.each { file ->
                        log.info("about to parse: ${file.name}")
                        Map metaDataFields = mde.getMetadataForFile(file)
                        parsedFile() {
                            fileMetadata(name: file.name) {
                                metaDataFields.each { "$it.key"(it.value) }
                            }
                        }
                    }
                }
            }
        }
    }
    static class RecursiveFileFinder {

        private static final List ALLOWED_FILES = [
            "mp4",
            "ai",
            "asf",
            "gif",
            "info",
            "txt",
            "xlsm",
            "tif",
            "dmg",
            "pps",
            "xml",
            "sample",
            "dot",
            "eps",
            "mp3",
            "docx",
            "xls",
            "jpg",
            "zip",
            "ppt",
            "pdf",
            "doc"
        ]

        public List<File> findSuitableFiles(File rootDir) {
            def files = []
            rootDir.eachFileRecurse { file ->
                // No directories, only files
                if (!file.isDirectory()) {
                    // Only allowed suffixes
                    def suffix = FilenameUtils.getExtension(file.name)
                    if (ALLOWED_FILES.contains(suffix.toLowerCase())) {
                        files << file
                    }
                }
            }
            return files
        }
    }
    @Slf4j
    static class MetadataExtractor {

        /**
         * List of metadata fields to be extracted.
         * Change these if you would like to extract different fields.
         */
        private static final List METADATA_FIELDS = [
            Metadata.CONTENT_TYPE,
            TITLE,
            CREATOR,
            MODIFIER,
            RIGHTS,
            CREATED,
            MODIFIED,
            COVERAGE,
            KEYWORDS,
            DESCRIPTION
        ]

        public Map getMetadataForFile(File file) {
            Metadata metadata = parseFile(file)
            String md5 = generateMD5(file)
            Map extractedMetadata = extractMetadata(metadata)
            extractedMetadata << ["file-md5": md5]
            return extractedMetadata
        }

        private Map extractMetadata(Metadata tikaMeta) {
            def nonEmptyFields = [:]
            METADATA_FIELDS.each { field ->
                def extractedMetadataField = tikaMeta.get(field)
                if (extractedMetadataField) {
                    // CONTENT_TYPE is a plain String key; the TikaCoreProperties constants are Property objects
                    String key = field instanceof String ? field : field.name
                    nonEmptyFields << ["$key": extractedMetadataField]
                }
            }
            return nonEmptyFields
        }
        private String generateMD5(File f) {
            MessageDigest digest = MessageDigest.getInstance("MD5")
            digest.update(f.getBytes())
            new BigInteger(1, digest.digest()).toString(16).padLeft(32, '0')
        }

        private Metadata parseFile(File file) {
            TikaConfig tikaConfig = new TikaConfig()
            Metadata tikaMeta = new Metadata()
            BodyContentHandler handler = new BodyContentHandler()
            Parser parser = new AutoDetectParser(tikaConfig)
            // withInputStream closes the stream even if parsing fails
            file.withInputStream { stream ->
                try {
                    parser.parse(stream, handler, tikaMeta)
                    log.debug("parsed file ${file.absolutePath}")
                } catch (Exception e) {
                    log.error("Failed to parse file ${file.absolutePath}", e)
                }
            }
            return tikaMeta
        }
    }
}
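Not part of the gist, but for completeness: once metadata.xml has been written, it can be read back from another Groovy script with XmlSlurper. The sketch below is illustrative only and assumes the element and namespace names shown in the Javadoc example above (parsedFile, fileMetadata, dc:title, file-md5):

// Hypothetical consumer of metadata.xml, assuming it sits in the current working directory.
def meta = new XmlSlurper()
        .parse(new File("metadata.xml"))
        .declareNamespace(dc: "http://purl.org/dc/elements/1.1/",
                          dcterms: "http://purl.org/dc/terms/")

meta.parsedFile.fileMetadata.each { fm ->
    // @name is the file name attribute written by the script above
    def title = fm.'dc:title'.text()
    def md5 = fm.'file-md5'.text()
    println "${fm.@name}: title='${title ?: 'n/a'}', md5=${md5}"
}

XmlSlurper lives in groovy.util in the Groovy 2.x line this gist targets, so no extra import or dependency is needed.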