Created
February 3, 2022 18:30
-
-
Save natebeaty/33484820f1a05c647154de2b1bba1d15 to your computer and use it in GitHub Desktop.
process subsubdirs of $1 to extract magazine data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#######################################
# Report a fatal error and terminate the script.
# Arguments: $@ - words of the error message (joined with single spaces)
# Outputs:   "<script path> ERROR: <message>" on stderr, nothing on stdout
# Returns:   does not return; exits the shell with status 1
#######################################
die() {
  # "$*" (not "$@") because the arguments are being folded into one string;
  # printf keeps messages starting with "-n" or containing backslashes intact.
  printf '%s ERROR: %s\n' "$0" "$*" >&2
  exit 1
}
#######################################
# Walk every sub-subdirectory of the directory given as $1, read its
# mag.html, write a mag.json (title, slug, table of contents) next to it,
# and rename the sub-subdirectory to the slug derived from the title.
# Arguments: $1 - root directory; expected layout is $1/<dir>/<subdir>/mag.html
# Outputs:   one 'title="...", dir="..."' line per processed subdirectory on
#            stdout (for later batch processing); diagnostics on stderr
# Returns:   exits 1 through die() on a missing argument, an empty level of
#            the tree, a missing mag.html, a missing title, or a failed write
# Requires:  awk with multi-character RS support (e.g. gawk), tac, sed, perl
#######################################
main() {
  local root sub src name html title toc slug json_title final
  local -a dirs dirs2

  [[ -n "${1:-}" ]] || die "Argument missing."
  [[ -d "$1" ]] || die "Arg '$1' is not a directory."
  cd -- "$1" || die "Can't access '$1'."
  root=$1

  # A trailing slash restricts the glob to directories, and nullglob turns
  # "no match" into an empty array instead of the literal pattern. Testing
  # only the first match with -d would abort whenever a plain file happens
  # to sort first.
  shopt -s nullglob
  dirs=(*/)
  (( ${#dirs[@]} )) || die "No dirs found."

  # loop through each subdirectory of $1
  for sub in "${dirs[@]}"; do
    sub=${sub%/}
    # Paths are kept relative to $1 rather than cd-ing in and out, so a
    # failed cd can never leave the loop working in the wrong directory.
    dirs2=("$sub"/*/)
    (( ${#dirs2[@]} )) || die "No dirs2 found."

    # loop through each subsubdirectory
    for src in "${dirs2[@]}"; do
      src=${src%/}
      name=${src##*/}
      html=$src/mag.html

      # abort if no mag.html
      [[ -f "$html" ]] || die "No $name/mag.html"

      # Title: text of the first <title> element with newlines removed.
      # Using "</title>" as the record separator makes the whole element
      # one awk record even when it spans several lines.
      title=$(awk -v RS="</title>" \
        '/<title>/{gsub(/.*<title>|\n+/,"");print;exit}' "$html")

      # TOC: take the lines from the one matching "contents in the format"
      # through the one matching "};", drop the first line and the last two,
      # drop lines containing "//", rewrite "[ " to '{ "name": ' and
      # "<digits> ]" to '"page": "<digits>" }', then remove the final comma
      # so the entries can be embedded in a JSON array.
      toc=$(awk '/contents in the format/,/\};/' "$html" \
        | tail -n +2 \
        | tac | sed '1,2 d' | tac \
        | awk '!/\/\//' \
        | sed 's/\[ /\{ "name": /g' \
        | sed -E 's/([0-9]+) \]/"page": "\1" }/g' \
        | perl -00pe 's/,(?!.*,)//s')

      # Slug: non-alphanumerics become "-", runs of "-" collapse, lowercase.
      slug=$(printf '%s' "$title" \
        | sed -e 's/[^[:alnum:]]/-/g' | tr -s '-' | tr A-Z a-z)
      # An empty slug would turn the rename below into `mv dir ""`.
      [[ -n "$slug" ]] || die "No title found in $name/mag.html"

      # Escape backslashes and double quotes so the title is a valid JSON
      # string; printf '%s' also leaves backslash sequences in the title and
      # TOC untouched (echo -e would have expanded them).
      json_title=${title//\\/\\\\}
      json_title=${json_title//\"/\\\"}
      printf '{\n "title": "%s",\n "slug": "%s",\n "toc": [\n%s\n ]\n}\n' \
        "$json_title" "$slug" "$toc" > "$src/mag.json" \
        || die "Can't write $name/mag.json"

      # Rename the directory to its slug unless something already has that
      # name; remember where the data actually lives so the report below
      # never points at a directory that was not created by this entry.
      final=$name
      if [[ "$name" != "$slug" ]]; then
        if [[ -e "$sub/$slug" ]]; then
          printf '%s: %s already exists, leaving %s in place\n' \
            "$0" "$sub/$slug" "$src" >&2
        elif mv -- "$src" "$sub/$slug"; then
          final=$slug
        else
          printf '%s: could not rename %s to %s\n' \
            "$0" "$src" "$sub/$slug" >&2
        fi
      fi

      # output for future batch processing
      printf 'title="%s", dir="%s"\n' "$title" "$root/$sub/$final"
    done
  done

  echo "WTF DONE"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment