-
-
Save themorgantown/2660809a0447a9044bdf1c1241a479d2 to your computer and use it in GitHub Desktop.
Convert Word documents into Markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Generate Markdown versions of Word documents. Output goes in a separate
# folder, since images are extracted and converted as well (a separate folder
# avoids naming clashes).
#
# For every sub-directory of the current working directory, each file whose
# name (without its extension) consists only of digits is converted with
# pandoc (`--extract-media` pulls the embedded images out of the document).
#
# REQUIREMENTS: pandoc, GNU sed (`gsed`, or a `sed` that is GNU sed)
#
# USAGE:
#
#   docx2md.sh a
#
# An argument is mandatory, but its value is not used for anything else: every
# sub-directory of the current directory is processed.
#
# Markdown files are written to `ROOTDIR/<dir>/_<name>.html.md`. Extracted
# pictures are copied to `PUBLICDIR/<dir>/`.
#
# EXIT CODES: 9 = a required tool is missing, 13 = no argument given.

# `command -v` is the portable way to test whether a tool is installed.
if ! command -v pandoc > /dev/null; then
  echo "FATAL missing pandoc. You can install with 'brew install pandoc' or similar" >&2
  exit 9
fi

# The link rewrite below needs GNU sed (`-i` without a suffix argument, `-r`).
# On macOS that is usually installed as `gsed`; elsewhere plain `sed` may
# already be GNU sed.
if command -v gsed > /dev/null; then
  readonly SED="gsed"
elif sed --version 2> /dev/null | grep -q 'GNU'; then
  readonly SED="sed"
else
  echo "FATAL missing GNU sed. You can install with 'brew install gnu-sed' or similar" >&2
  exit 9
fi

if [[ -z "$1" ]]; then
  echo "Usage:" >&2
  echo "" >&2
  echo "  docx2md.sh a" >&2
  exit 13
fi

# A tilde inside double quotes is never expanded by the shell, so use $HOME.
readonly ROOTDIR="$HOME/Desktop/source"
readonly PUBLICDIR="$HOME/Desktop/dest"

# Only file names made up purely of digits (extension stripped) are converted.
readonly NUMERIC_NAME_RE='^[0-9]+$'

# Without nullglob an unmatched `*/` would be iterated as the literal pattern.
shopt -s nullglob

# Pass 1: convert the documents and rewrite their image links.
for d in */; do
  dir="${d%/}"
  media_dir="$dir/pandoc-output"

  # -p: do not fail on re-runs or when the parent directory is missing.
  mkdir -p -- "$ROOTDIR/$dir" "$PUBLICDIR/$dir"

  for fullfile in "$dir"/*; do
    [[ -f "$fullfile" ]] || continue

    filename="${fullfile##*/}"
    filename="${filename%.*}"
    [[ "$filename" =~ $NUMERIC_NAME_RE ]] || continue

    outfile="$ROOTDIR/$dir/_$filename.html.md"

    mkdir -p -- "$media_dir"
    if ! pandoc -f docx -t markdown --extract-media="$media_dir" -o "$outfile" "$fullfile"; then
      echo "WARNING pandoc failed for $fullfile, skipping" >&2
      continue
    fi

    # Turn "<dir>/pandoc-output/media/<image>" into "/media/<dir>/<image>".
    # The `g` flag rewrites every link on a line, not just the first one.
    "$SED" -i -r \
      's#([a-zA-Z0-9_-]+)/pandoc-output/media/([a-zA-Z0-9]+)#/media/\1/\2#g' \
      "$outfile"
  done
done

# Pass 2: copy the extracted pictures into the public folder.
for d in */; do
  dir="${d%/}"
  media_src="$PWD/$dir/pandoc-output/media"

  # Directories in which nothing was converted have no media folder.
  [[ -d "$media_src" ]] || continue

  cp -r -- "$media_src/." "$PUBLICDIR/$dir"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment