Last active
August 29, 2015 14:17
-
-
Save JGVerdugo/37bfc77286992cfadfa9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Runs in Python 3
# Processes MSWord files with Tika. Zero exception or error control, use at own risk.
# Assumes lowercase .doc and .docx extensions.
# Output is written to new files with corresponding file exts in the same dir.
import os | |
import sys | |
import subprocess | |
import glob | |
# Usage: python convertdoc.py [dir] [meta/txt/html/xml]

# Directory scanned for Word documents. Replaced by the first command-line
# argument when one is given.
curDir = "."

# Path of the Tika application jar that is handed to `java -jar`. The
# TIKA_JAR environment variable, when set, overrides the built-in path so
# the script can run on another machine without being edited.
tika = os.environ.get(
    "TIKA_JAR", "/home/pepe/code/tika-1.7/tika-app-1.7.jar"
)

# Default conversion mode, passed to Tika as a command-line flag. Use "txt",
# "html" and "xml" on the command line for the others.
mode = "--metadata"
def newName(name, mode):
    """Return the output path for *name* under the given Tika *mode*.

    The extension of *name* is replaced by one that reflects the
    conversion mode:

    * ``--metadata`` -> ``meta``
    * ``--text``     -> ``txt``
    * ``--html``     -> ``html``
    * ``--xml``      -> ``xml``

    Any other mode falls back to ``meta``.

    The extension is split off with ``os.path.splitext`` instead of
    dropping a fixed three characters, so ``report.docx`` maps to
    ``report.txt`` rather than ``report.dtxt``.

    NOTE: ``report.doc`` and ``report.docx`` in the same directory map to
    the same output path.
    """
    suffix_by_mode = {
        "--metadata": "meta",
        "--text": "txt",
        "--html": "html",
        "--xml": "xml",
    }
    # splitext only looks at the last path component, so dots in directory
    # names are left alone.
    root, _old_ext = os.path.splitext(name)
    return root + "." + suffix_by_mode.get(mode, "meta")
# Command line: [dir] [meta/txt/html/xml]. Both arguments are optional.
if len(sys.argv) >= 2:
    curDir = sys.argv[1]
if len(sys.argv) >= 3:
    # Map the short command-line keyword to the flag Tika expects. An
    # unrecognised keyword leaves the current default mode in place.
    cli_modes = {
        "meta": "--metadata",
        "txt": "--text",
        "html": "--html",
        "xml": "--xml",
    }
    mode = cli_modes.get(sys.argv[2], mode)

# The pattern is written in lower case; upper-case extensions such as ".DOC"
# may not be matched.
filelist = glob.glob(os.path.join(curDir, "*.do*"))

for filename in filelist:
    print("Processing " + filename + "...")
    newFile = newName(filename, mode)
    # Tika writes its result to stdout, which is redirected into the output
    # file. The `with` block closes the file even if launching java fails.
    with open(newFile, "w") as f:
        status = subprocess.call(
            ["java", "-jar", tika, mode, filename], stdout=f
        )
    if status != 0:
        # Report the failure but keep going with the remaining files.
        print(
            "Tika exited with status " + str(status) + " for " + filename,
            file=sys.stderr,
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment