Last active
June 13, 2017 15:38
-
-
Save matt2718/e49613b19a168f3099967ba342936129 to your computer and use it in GitHub Desktop.
Scrape quiz bowl packets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Scrape quiz bowl packets.
#
# Fetches the archive index page at $baseurl, finds every packet set linked
# from it, and downloads each packet of each set into "$outdir/<set name>/".
#
# Usage:   run with no arguments; edit baseurl/outdir below to change source.
# Needs:   curl, grep, GNU sed (the replacement "\n" below is a GNU extension).
# Exit:    0 if everything was fetched, 1 if the index or any set/packet failed.

set -u

# change these depending on where you want to pull from
# for the hs archive, the URL should be 'http://www.quizbowlpackets.com'
baseurl='http://collegiate.quizbowlpackets.com'
outdir='collegiate'

# Print a diagnostic on stderr so it never mixes with progress output.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

#######################################
# Fetch a URL.
# -f makes HTTP errors (404, 500, ...) a failure instead of saving the error
# page as if it were a packet; -sS stays quiet but still reports errors;
# -L follows redirects.
# Arguments: $1 - URL; remaining arguments are passed through to curl
# Outputs:   response body on stdout unless -o is passed through
#######################################
fetch() {
  local url=$1
  shift
  curl -fsSL "$@" "$url"
}

#######################################
# Download every packet linked from one set's listing page.
# Arguments: $1 - URL of the set page (also the prefix of its packet links)
#            $2 - directory to save the packets in (must already exist)
# Returns:   0 if every packet was saved, 1 otherwise
#######################################
download_set() {
  local prefix=$1 dest=$2
  local listing entry rest packurl filename
  local status=0

  if ! listing=$(fetch "$prefix"); then
    warn "could not fetch packet listing $prefix"
    return 1
  fi

  while IFS= read -r entry; do
    # Take the URL from the href that matched, not just the first quoted
    # string on the line, so other attributes before href cannot confuse us.
    rest=${entry#*"href=\"$prefix"}
    packurl=$prefix${rest%%\"*}
    filename=${packurl##*/}

    # A link ending in "/" (or a dot directory) has no usable file name.
    if [[ -z "$filename" || "$filename" == . || "$filename" == .. ]]; then
      continue
    fi

    if ! fetch "$packurl" -o "$dest/$filename"; then
      warn "could not download $packurl"
      status=1
    fi
  done < <(
    sed -e 's/<\/LI>/\n/g' <<<"$listing" | # packets show up on different lines
      grep -F -- "href=\"$prefix"          # lines with links to packets
  )

  return "$status"
}

#######################################
# Walk the index page and download every set it links to.
# Globals:  baseurl, outdir (read)
# Returns:  0 on full success, 1 if anything could not be fetched
#######################################
main() {
  local index line seturl setname
  local status=0

  if ! index=$(fetch "$baseurl"); then # get index page
    warn "could not fetch index page $baseurl"
    return 1
  fi

  while IFS= read -r line; do
    # Each line is "<set url>*<set name>"; skip anything that did not parse.
    [[ "$line" == *'*'* ]] || continue
    seturl=${line%%\**}
    setname=${line#*\*}
    setname=${setname//\//,} # remove /

    # The name comes from remote HTML: refuse values that would make us
    # write outside "$outdir" or into "$outdir" itself.
    if [[ -z "$setname" || "$setname" == . || "$setname" == .. ]]; then
      warn "skipping set with unusable name (url: $seturl)"
      status=1
      continue
    fi

    echo "Downloading packets from $setname"
    if ! mkdir -p -- "$outdir/$setname"; then
      warn "could not create directory $outdir/$setname"
      status=1
      continue
    fi

    download_set "$baseurl/$seturl" "$outdir/$setname" || status=1
  done < <(
    grep '<SPAN class="Name">' <<<"$index" | # extract lines containing packet links
      sed -e 's/^.*href="//' -e 's/">/*/' -e 's/<\/a>.*$//' # parse url and name
  )

  return "$status"
}

main
exit $?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment