Created
February 25, 2016 20:39
-
-
Save lovromazgon/ac93d4c0b01f24300d8c to your computer and use it in GitHub Desktop.
Bash script that downloads all content of learnyousomeerlang.com and combines it into a single HTML file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Downloads every chapter of learnyousomeerlang.com, starting at FIRST_URL and
# following each page's "Next chapter" link, then concatenates the
# <div id="content"> element of every downloaded page into OUTPUT_FILENAME.
#
# Requires: wget, grep, sed, head, xmllint.
# Side effects: creates FOLDER and (re)creates OUTPUT_FILENAME in the current
# working directory.

# -u catches typos in variable names. -e / pipefail are deliberately not used:
# grep finding no "Next chapter" link is an expected outcome, not an error.
# Critical commands are checked explicitly instead.
set -u

readonly FIRST_URL="http://learnyousomeerlang.com/introduction"
# The last chapter's "Next chapter" link points here; reaching it ends the crawl.
readonly LAST_URL="http://learnyousomeerlang.com/content"
readonly FOLDER="html"
readonly OUTPUT_FILENAME="learnyousomeerlang.html"

# Prints a message to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Prints the href of the "Next chapter" link found in the HTML file $1.
# Prints nothing when the page has no such link. Only the first matching line
# is used, so pages that repeat the link yield a single URL.
next_chapter_url() {
  grep -o '.*<a href="[^"]*" title="Next chapter">' -- "$1" \
    | sed -n 's/.*href="\([^"]*\)" title.*/\1/p' \
    | head -n 1
}

# -p: do not fail when the folder is left over from a previous run.
mkdir -p -- "$FOLDER" || die "cannot create directory: $FOLDER"

url=$FIRST_URL
i=1
while true; do
  echo "Downloading $url"
  base=$(basename -- "$url")
  # Zero-padded index keeps the files in chapter order when globbed below
  # (lexicographic order matches numeric order for up to 99 chapters).
  printf -v filename '%s/%02d-%s' "$FOLDER" "$i" "$base"
  wget -q -O "$filename" -- "$url" || die "download failed: $url"
  url=$(next_chapter_url "$filename")
  if [[ -z "$url" || "$url" == "$LAST_URL" ]]; then
    break
  fi
  i=$((i + 1))
done

# Create one html: start from an empty file so re-running the script does not
# append a second copy of the book.
: > "$OUTPUT_FILENAME" || die "cannot write: $OUTPUT_FILENAME"
for f in "$FOLDER"/*; do
  [[ -e "$f" ]] || continue # glob matched nothing
  xmllint --html --xpath "//div[@id='content']" "$f" >> "$OUTPUT_FILENAME" \
    || printf 'warning: could not extract content div from %s\n' "$f" >&2
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment