Created
October 16, 2015 10:48
-
-
Save ehrenfeu/c53caebe893345e104d8 to your computer and use it in GitHub Desktop.
"I, Librarian" PDF file exporter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# "I, Librarian" PDF file exporter.
#
# Reads the records with START_ID < id < STOP_ID from the library database
# and copies each record's PDF from the library directory (LIB) into the
# export directory (TGT) under a human-readable name built as
#   "<first author> <year> <journal> <title>.pdf"
# Records with an empty journal or file field are skipped; files that already
# exist in TGT are left untouched.
#
# Exit status: 100 if the database cannot be queried.

# Abort on unset variables and let a failing sqlite3 in the main pipeline
# be reflected in the script's exit status.
set -u -o pipefail

LIB="/storage/www/librarian/library"
DB="${LIB}/database/library.sq3"
TGT="/storage/pdf_export"
START_ID="4500"
STOP_ID="99999999"
# maximum length (in characters) of the generated name, excluding ".pdf"
MAX_LEN="150"

# check if connecting and querying the DB works:
sqlite3 "${DB}" 'SELECT MAX(id) FROM library;' > /dev/null 2>&1 || {
    echo "ERROR accessing database: ${DB}" >&2
    exit 100
}

# Query all required fields from the DB and loop over the results.
# The output format is pinned explicitly (list mode, '|' separator, no header
# row) so that user-level sqlite3 defaults cannot change what the loop parses.
# NOTE(review): a field containing a newline would still be split across
# several loop iterations; the records are assumed to be single-line.
sqlite3 -list -noheader -separator '|' "${DB}" \
    "SELECT file, authors, year, journal, title
       FROM library
      WHERE id > '${START_ID}'
        AND id < '${STOP_ID}';" |
while IFS='|' read -r ORIG AUTHOR YEAR JOURNAL TITLE ; do
    # 'read -r' keeps backslashes literal, and splitting happens inside
    # 'read' itself, so no pathname expansion is applied to the fields.
    # TITLE is the last variable and therefore receives the remainder of
    # the line, including any further '|' characters.

    if [[ -z "${JOURNAL}" ]] ; then
        echo "JOURNAL-field of '$TITLE' is empty, skipping PDF"
        continue
    fi

    # without a source file name "cp" would be pointed at the library
    # directory itself, so skip such records explicitly
    if [[ -z "${ORIG}" ]] ; then
        echo "FILE-field of '$TITLE' is empty, skipping PDF"
        continue
    fi

    # normalize whitespace, cut away everything after the first comma, then
    # remove everything after the last blank (to remove the first name initial)
    AUTHOR="$(printf '%s\n' "${AUTHOR}" | sed '
        s/[[:space:]][[:space:]]*/ /g
        s/^ //
        s/ $//
        s/,.*//
        s/ [^ ]*$//
    ')"

    FILENAME="${AUTHOR} ${YEAR} ${JOURNAL} ${TITLE}"
    # remove special chars
    # replace multiple consecutive whitespaces by a single one and trim
    # replace "PNAS"
    # replace "C elegans"
    FILENAME="$(printf '%s\n' "${FILENAME}" | sed '
        s|[,:/"?]||g
        s/[[:space:]][[:space:]]*/ /g
        s/^ //
        s/ $//
        s/Proc Natl Acad Sci U S A/PNAS/
        s/Caenorhabditis elegans/C elegans/
    ')"

    # if the name exceeds the maximum length, cut it and remove the
    # word-fragment left at the end. Length and cut are both done by the
    # shell so they use the same unit and never split a character.
    NAME_LEN=${#FILENAME}
    if (( NAME_LEN > MAX_LEN )) ; then
        echo "filename-length: $NAME_LEN, cutting:"
        FILENAME="${FILENAME:0:MAX_LEN}"
        FILENAME="${FILENAME% *}"
    fi

    # append the extension and remove double full stops:
    FILENAME="${FILENAME}.pdf"
    FILENAME="${FILENAME//../.}"

    printf 'processing %s -> %s ' "${ORIG}" "${FILENAME}"
    if [[ -f "${TGT}/${FILENAME}" ]] ; then
        # already exported earlier, just terminate the progress line
        echo
    elif cp -- "${LIB}/${ORIG}" "${TGT}/${FILENAME}" ; then
        echo "[copied]"
    else
        # cp has already printed the reason on stderr; close the progress
        # line so the next record starts on a fresh line
        echo "[FAILED]"
    fi
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment