Skip to content

Instantly share code, notes, and snippets.

@natebeaty
Created February 3, 2022 18:30
Show Gist options
  • Save natebeaty/33484820f1a05c647154de2b1bba1d15 to your computer and use it in GitHub Desktop.
Save natebeaty/33484820f1a05c647154de2b1bba1d15 to your computer and use it in GitHub Desktop.
process subsubdirs of $1 to extract magazine data
#!/bin/bash
# Abort helper: report an error on stderr and stop the script.
# Arguments: the message words; they are joined with single spaces and
#            prefixed with the script name ($0) and "ERROR:".
# Exits:     always terminates the shell with status 1.
die() {
  # Pin IFS locally so "$*" joins the arguments with one space each.
  local IFS=' '
  printf '%s\n' "$0 ERROR: $*" >&2
  exit 1
}
# Walk every sub-subdirectory of $1. For each one, read mag.html, write a
# mag.json next to it (title, slug, table of contents), rename the directory
# to the slugified title, and print one "title=..., dir=..." line on stdout.
# Argument: $1 - top-level directory; its layout must be $1/<dir>/<magazine>/.
# Aborts via die() on the first problem.
[[ -n "$1" ]] || die "Argument missing."
[[ -d "$1" ]] || die "Arg '$1' is not a directory."
cd -- "$1" || die "Can't access '$1'."

# The trailing slash makes the glob match directories only, so stray files are
# skipped rather than treated as magazine folders. When nothing matches, the
# pattern stays literal and the -d test below fails.
dirs=(*/)
[[ -d "${dirs[0]}" ]] || die "No dirs found."

# loop through each subdirectory of $1
for d in "${dirs[@]}"; do
  d=${d%/}
  # A failed cd must abort: otherwise the inner glob would run in the wrong
  # directory and the later "cd .." would climb out of $1.
  cd -- "$d" || die "Can't access '$1/$d'."

  dirs2=(*/)
  [[ -d "${dirs2[0]}" ]] || die "No dirs2 found."

  # loop through each subsubdirectory
  for d2 in "${dirs2[@]}"; do
    d2=${d2%/}
    html="$d2/mag.html"

    # abort if no mag.html
    [[ -f "$html" ]] || die "No $d2/mag.html"

    # extract title from <title> tag (newlines inside the tag are removed)
    title=$(awk -v RS="</title>" '/<title>/{gsub(/.*<title>|\n+/,"");print;exit}' "$html")
    # An empty title would yield an empty slug and an impossible rename.
    [[ -n "$title" ]] || die "No <title> found in $d2/mag.html"

    # Extract the TOC from the js array midway in the HTML and convert it to
    # json format. Stage by stage:
    #   awk   - keep the lines from "contents in the format" through "};"
    #   tail  - drop the first of those lines
    #   tac/sed/tac - drop the last two lines
    #   awk   - drop every line containing "//"
    #   sed   - turn "[ " into '{ "name": ' and "<digits> ]" into
    #           '"page": "<digits>" }'
    #   perl  - remove the last comma
    toc=$(
      awk '/contents in the format/,/\};/' "$html" \
        | tail -n +2 \
        | tac \
        | sed '1,2 d' \
        | tac \
        | awk '!/\/\//' \
        | sed 's/\[ /\{ "name": /g' \
        | sed -E 's/([0-9]+) \]/"page": "\1" }/g' \
        | perl -00pe 's/,(?!.*,)//s'
    )

    # slugify title: non-alphanumerics become "-", runs of "-" are squeezed,
    # and the result is lowercased
    slug=$(printf '%s' "$title" | sed -e 's/[^[:alnum:]]/-/g' | tr -s '-' | tr 'A-Z' 'a-z')

    # Escape backslashes and double quotes so the title is a valid JSON string.
    json_title=${title//\\/\\\\}
    json_title=${json_title//\"/\\\"}

    # create json file; printf '%s' writes the data verbatim, with no
    # backslash-escape interpretation of the title or TOC text
    printf '{\n "title": "%s",\n "slug": "%s",\n "toc": [\n%s\n ]\n}\n' \
      "$json_title" "$slug" "$toc" > "$d2/mag.json" \
      || die "Can't write $d2/mag.json"

    # rename dir to slug (if doesn't exist already); remember where the
    # magazine actually ended up so the report below is accurate
    final=$d2
    if [[ "$d2" != "$slug" ]]; then
      if [[ -e "$slug" ]]; then
        echo >&2 "$0 WARNING: '$1/$d/$slug' already exists; leaving '$d2' in place."
      else
        mv -- "$d2" "$slug" || die "Can't rename '$d2' to '$slug'."
        final=$slug
      fi
    fi

    # output for future batch processing
    printf 'title="%s", dir="%s"\n' "$title" "$1/$d/$final"
  done

  cd .. || die "Can't return to '$1'."
done
echo "WTF DONE"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment