Last active
March 27, 2017 20:47
-
-
Save ericleasemorgan/c4e34ffad96c0221f1ff to your computer and use it in GitHub Desktop.
(brain-dead) shell script using TIKA in server mode to convert a batch of files to plain text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# tika2text.sh - given a directory, extract plain text from the files in it
#
# Usage: tika2text.sh <directory>
#
# Starts TIKA in server mode, sends every regular file found directly inside
# <directory> to it with nc, and writes whatever TIKA returns to STDOUT.
# Subdirectories are skipped (the loop below is not recursive). Progress and
# error messages go to STDERR so they do not mix with the extracted text.
#
# Environment:
#   TIKA_JAR  optional; path to the TIKA jar (overrides the default below)
#
# Exit status: 0 on success; 1 on a usage/setup error or when at least one
# file could not be sent to TIKA.

# Eric Lease Morgan <[email protected]>
# (c) University of Notre Dame, distributed under a GNU Public License

# March 27, 2017 - a second cut; works with a directory


# configure
HOST='127.0.0.1'
TIKA="${TIKA_JAR:-/Users/emorgan/desktop/tika.jar}"
MODE="--text"
PORT=12345

# get input and sanity check; exactly one non-empty argument is expected
if [[ $# -ne 1 || -z "$1" ]]; then
  printf 'Usage: %s <directory>\n' "$0" >&2
  exit 1
fi
DIRECTORY=$1

# validate everything up front, before paying for a JVM start
if [[ ! -d "$DIRECTORY" ]]; then
  printf 'Error: %s is not a directory\n' "$DIRECTORY" >&2
  exit 1
fi
if [[ ! -r "$TIKA" ]]; then
  printf 'Error: cannot read TIKA jar %s\n' "$TIKA" >&2
  exit 1
fi

# stop the background TIKA process and wait for it to go away
stop_tika() {
  printf 'Stopping TIKA (%s)...' "$PID" >&2
  # the process may already be gone (e.g. it failed to start), so stay quiet
  kill "$PID" 2>/dev/null
  wait "$PID" 2>/dev/null
  printf 'done\n' >&2
}

# start TIKA
printf 'Starting TIKA. Please wait... ' >&2
java -jar "$TIKA" "$MODE" --server -port "$PORT" &
PID=$!
printf '(%s)\n' "$PID" >&2

# from here on, never leave TIKA running behind us, whatever the exit path
trap stop_tika EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# give the JVM time to come up, then make sure it did not die on startup
sleep 10
if ! kill -0 "$PID" 2>/dev/null; then
  printf 'Error: TIKA failed to start\n' >&2
  exit 1
fi

# set up the environment; error check
if ! cd "$DIRECTORY"; then
  printf 'Error: cannot change directory to %s\n' "$DIRECTORY" >&2
  exit 1
fi

# process each file in the directory
STATUS=0
for FILE in ./*; do

  # only want files, not directories
  if [[ -f "$FILE" ]]; then

    # do the work and send it to STDOUT; keep going if one file fails
    if ! nc "$HOST" "$PORT" < "$FILE"; then
      printf 'Warning: could not process %s\n' "$FILE" >&2
      STATUS=1
    fi

  fi

done

# TIKA is stopped by the EXIT trap
exit "$STATUS"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment