Last active
June 8, 2019 08:24
-
-
Save markedphillips/dfde64d123cfc1188918e03380ebf461 to your computer and use it in GitHub Desktop.
The following script pulls documents from a few directories, renames them using the "Lastname, Firstname" taken from the directory path plus the file keyword, converts the documents to text, and prepares them for ingestion into a SQLite3 database, where further regex work will be done to extract data from the files. This was based on a file system with /Documents/Smith, Johnson File Close…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# Default root of the directory walk. view_dir() overwrites this with an
# absolute path before calling walk().
ABS_PATH=.

# Colourise the output.
# The escape sequences are stored literally (single quotes); they are only
# interpreted when expanded inside a printf *format* string.
RED='\033[0;31m' # Red
GRE='\033[0;32m' # Green
YEL='\033[1;33m' # Yellow
NCL='\033[0m'    # No Color
#######################################
# Print a coloured, indented summary of the file named by ${entry}:
# full path, file name, directory, base name, extension and size.
# Globals:
#   entry   (read)    path of the file to describe (set by the caller's loop)
#   indent  (read)    current indentation depth; unset is treated as 0
#   GRE, YEL, NCL (read)  colour escape sequences used in the format strings
#   FILE_NAME, DIR, NAME, EXT, SIZE (written)  the computed fields
# Arguments:
#   None
# Outputs:
#   Six lines on stdout.
#######################################
function file_specification() {
  # Everything is printed one level (4 columns) deeper than the directory.
  local pad=$((indent + 4))

  FILE_NAME="$(basename "${entry}")"
  DIR="$(dirname "${entry}")"

  # Only split on the last dot when there is something in front of a dot.
  # Otherwise "README" would report "README" as its extension and ".bashrc"
  # would end up with an empty base name.
  if [[ "${FILE_NAME}" == ?*.* ]]; then
    NAME="${FILE_NAME%.*}"
    EXT="${FILE_NAME##*.}"
  else
    NAME="${FILE_NAME}"
    EXT=""
  fi

  SIZE="$(du -sh "${entry}" | cut -f1)"

  # "%*s" with an empty string argument emits ${pad} spaces of indentation.
  printf "%*s${GRE}%s${NCL}\n" "${pad}" '' "${entry}"
  printf "%*s\tFile name:\t${YEL}%s${NCL}\n" "${pad}" '' "${FILE_NAME}"
  printf "%*s\tDirectory:\t${YEL}%s${NCL}\n" "${pad}" '' "${DIR}"
  printf "%*s\tName only:\t${YEL}%s${NCL}\n" "${pad}" '' "${NAME}"
  printf "%*s\tExtension:\t${YEL}%s${NCL}\n" "${pad}" '' "${EXT}"
  printf "%*s\tFile size:\t${YEL}%s${NCL}\n" "${pad}" '' "${SIZE}"
}
#######################################
# Recursively list a directory tree: print the directory name, describe each
# regular file in it via file_specification(), then descend into each
# sub-directory with the indentation increased by 4.
# Globals:
#   RED, NCL (read)  colour escape sequences for the directory heading
# Arguments:
#   $1 - directory to walk
#   $2 - indentation depth in columns (optional, default 0)
# Outputs:
#   The listing on stdout.
#######################################
function walk() {
  local indent="${2:-0}"
  # Local so a recursive call cannot clobber the caller's loop variable.
  # file_specification() still sees it through bash's dynamic scoping.
  local entry

  printf "\n%*s${RED}%s${NCL}\n\n" "$indent" '' "$1"

  # If the entry is a file do some operations.
  # An empty directory leaves the glob unexpanded; the -f/-d tests then
  # simply fail for the literal pattern, so no extra guard is needed.
  for entry in "$1"/*; do
    if [[ -f "$entry" ]]; then
      file_specification
    fi
  done

  # If the entry is a directory call walk() == create recursion
  for entry in "$1"/*; do
    if [[ -d "$entry" ]]; then
      walk "$entry" $((indent + 4))
    fi
  done
}
#######################################
# If the path is empty use the current directory, otherwise change into the
# given directory so that ${PWD} yields its absolute form; then exec walk().
# Note: when a path is given, the shell's working directory is changed and
# stays changed after the function returns.
# Globals:
#   ABS_PATH (written)  absolute path that was walked
# Arguments:
#   $1 - directory to list (optional; relative or absolute)
# Outputs:
#   The listing produced by walk(), followed by an empty line.
# Returns:
#   1 if the directory cannot be entered (nothing is walked), else 0.
#######################################
function view_dir () {
  if [[ -n "${1}" ]]; then
    # Bail out instead of walking a stale ABS_PATH when the cd fails.
    cd -- "${1}" || return 1
  fi
  ABS_PATH="${PWD}"
  walk "${ABS_PATH}"
  echo
}
# Search for key files and rename them based on "Lastname, Firstname" from the
# directory path and save in the directory called ${TIMESTAMP}/Keyword.
TIMESTAMP="$(date '+%Y%m%d_%H-%M-%S')"
mkdir -p "${TIMESTAMP}/Keyword" || exit 1

#######################################
# Print the "Lastname, Firstname" prefix of a "./"-relative path.
# The last name may contain letters, digits, spaces and hyphens; the first
# name letters and digits only.
# Arguments:
#   $1 - path such as "./Smith, John File/Sub/Keyword x.doc"
# Outputs:
#   The matched "Lastname, Firstname" on stdout.
# Returns:
#   1 (printing nothing) when the path has no such prefix.
#######################################
function _person_from_path() {
  # POSIX character classes are used because "\s" and "\-" are literal
  # characters inside a bracket expression and would not match spaces/hyphens.
  local re='^\./([[:alnum:][:space:]-]*,[[:space:]][[:alnum:]]*)'
  [[ "$1" =~ $re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

#######################################
# Copy every ./<dir>/<dir>/Keyword*.<ext> file into ${TIMESTAMP}/Keyword as
# "<Lastname, Firstname>_Keyword.<ext>".
# Globals:
#   TIMESTAMP (read)  name of the destination root directory
# Arguments:
#   $1 - file extension without the dot (e.g. doc, docx)
# Outputs:
#   cp -v progress on stdout; a warning on stderr for each skipped file.
#######################################
function _copy_keyword_files() {
  local ext="$1"
  local f person
  # globstar is not enabled in this script, so each "**" matches exactly one
  # path component: only files two directories below the current one match.
  for f in ./**/**/Keyword*."${ext}"; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "${f}" ]] || continue
    # Without a recognisable name every such file would be written to the
    # same "_Keyword.<ext>" target and overwrite the previous one.
    if ! person="$(_person_from_path "${f}")"; then
      printf 'Skipping %s: no "Lastname, Firstname" in path\n' "${f}" >&2
      continue
    fi
    cp -v "${f}" "${TIMESTAMP}/Keyword/${person}_Keyword.${ext}"
  done
}

echo "Copying source files"
_copy_keyword_files doc
_copy_keyword_files docx

echo "Listing contents"
ls -al "${TIMESTAMP}"/Keyword
# Convert the Word documents in the current directory and archive every stage
# (originals, docx, unzipped docx XML, zip copies, plain text) under a fresh
# timestamped directory, then remove the .doc/.docx files from the current
# directory.
# NOTE(review): this works on *.doc / *.docx in the current directory, not on
# the files copied into ${TIMESTAMP}/Keyword - confirm the intended directory.
sleep 1 # Make sure to get a new timestamp
TIMESTAMP_1="$(date '+%Y%m%d_%H-%M-%S')"
for subdir in Originals Word Zip Xml Txt; do
  mkdir -p "${TIMESTAMP_1}/${subdir}" || exit 1
done

echo "convert doc to docx"
for f in *.doc; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "${f}" ]] || continue
  textutil -convert docx "${f}"
done
for f in *.docx; do
  [[ -e "${f}" ]] || continue
  textutil -convert txt "${f}"
done

# Snapshot the file lists after conversion; an unmatched glob becomes an
# empty list instead of a literal pattern.
doc_files=(*.doc)
[[ -e "${doc_files[0]}" ]] || doc_files=()
docx_files=(*.docx)
[[ -e "${docx_files[0]}" ]] || docx_files=()
txt_files=(*.txt)
[[ -e "${txt_files[0]}" ]] || txt_files=()

# Abort on a failed copy: the sources are deleted further down, so carrying
# on after a failed backup would lose data.
if (( ${#doc_files[@]} )); then
  cp -v -- "${doc_files[@]}" "${TIMESTAMP_1}/Originals/" || exit 1
fi
if (( ${#docx_files[@]} )); then
  cp -v -- "${docx_files[@]}" "${TIMESTAMP_1}/Word/" || exit 1
  cp -v -- "${docx_files[@]}" "${TIMESTAMP_1}/Xml/" || exit 1
fi
if (( ${#txt_files[@]} )); then
  cp -v -- "${txt_files[@]}" "${TIMESTAMP_1}/Txt" || exit 1
fi

# A .docx file is a zip archive: keep a .zip copy and unpack its XML.
# Run in a subshell so the working directory of the script is untouched even
# when something in here fails.
(
  cd "${TIMESTAMP_1}/Xml/" || exit 1
  for f in *.docx; do
    [[ -e "${f}" ]] || continue
    # Strip the suffix with parameter expansion. A bracket expression such as
    # [^.docx] is a character set, not a suffix, and would also eat the
    # trailing "d" of "..._Keyword".
    new_file="${f%.docx}"
    cp -v -- "${f}" "${new_file}.zip"
    unzip "${f}" -d "${new_file}"
  done
  zip_files=(*.zip)
  if [[ -e "${zip_files[0]}" ]]; then
    mv -v -- "${zip_files[@]}" ../Zip
  fi
) || exit 1

# The documents are archived under ${TIMESTAMP_1}; remove them from the
# current directory.
if (( ${#docx_files[@]} + ${#doc_files[@]} )); then
  rm -v -- "${docx_files[@]}" "${doc_files[@]}"
fi
# Now we have the converted text files, clean and prepare for SQLITE3 insertion

#######################################
# Write a copy of every *.txt file in the current directory with runs of
# blank lines squeezed to a single blank line (cat -s). The copy is named
# "<original name>_.txt", e.g. "a.txt" -> "a.txt_.txt".
# Arguments:
#   None
# Outputs:
#   One new file per input file; nothing on stdout.
#######################################
function _squeeze_blank_lines() {
  local f
  for f in *.txt; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "${f}" ]] || continue
    cat -s -- "${f}" > "${f}_.txt"
  done
}

_squeeze_blank_lines
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment