matt2718 · June 13, 2017 15:38
diff --git a/qb-scrape.sh b/qb-scrape.sh
 #!/bin/bash
 #
 # Download every packet of every set listed on a quizbowlpackets.com archive.
 # Each packet is saved as "$outdir/<set name>/<packet file name>".
 # The run is best-effort: a set or packet that fails to download is reported
 # on stderr and skipped, and the remaining ones are still attempted.

 # change these depending on where you want to pull from
 # for the hs archive, the URL should be 'http://www.quizbowlpackets.com'
 baseurl='http://collegiate.quizbowlpackets.com'
 outdir='collegiate'

 # -f makes curl fail on HTTP errors so error pages are never parsed or saved
 # as packets; -sS hides the progress meter but still prints real errors.
 curl -fsS "$baseurl" | # get index page
 	grep '<SPAN class="Name">' | # extract lines containing packet links
 	sed -e 's/^.*href="//' -e 's/">/*/' -e 's/<\/a>.*$//' | # reduce to "url*name"
 	while IFS= read -r line; do
 		# skip index lines that did not reduce to the "url*name" form
 		[[ $line == *\** ]] || continue

 		# split on the first '*' using parameter expansion only, so the
 		# scraped text is never word-split or glob-expanded
 		seturl=${line%%\**}
 		setname=${line#*\*}
 		setname=${setname//\//,} # '/' is not allowed inside a directory name

 		printf '%s\n' "Downloading packets from $setname"
 		mkdir -p "$outdir/$setname" || continue

 		# literal text that starts every packet link belonging to this set
 		linkprefix="href=\"$baseurl/$seturl"

 		curl -fsS "$baseurl/$seturl" | # get packet listing
 			sed -e 's/<\/LI>/\n/g' | # put each packet entry on its own line
 			grep -F -- "$linkprefix" | # lines with links to packets (literal match)
 			while IFS= read -r line2; do
 				# the packet URL runs from the matched prefix to the closing quote
 				rest=${line2#*"$linkprefix"}
 				rest=${rest%%\"*}
 				packurl="$baseurl/$seturl$rest"
 				filename=${packurl##*/}
 				# a URL ending in '/' has no file name to save under
 				[[ -n $filename ]] || continue
 				curl -fsS "$packurl" -o "$outdir/$setname/$filename" ||
 					printf 'failed to download %s\n' "$packurl" >&2
 			done
 	done
	#!/bin/bash
	#
	# Download every packet of every set listed on a quizbowlpackets.com archive.
	# Each packet is saved as "$outdir/<set name>/<packet file name>".
	# The run is best-effort: a set or packet that fails to download is reported
	# on stderr and skipped, and the remaining ones are still attempted.

	# change these depending on where you want to pull from
	# for the hs archive, the URL should be 'http://www.quizbowlpackets.com'
	baseurl='http://collegiate.quizbowlpackets.com'
	outdir='collegiate'

	# -f makes curl fail on HTTP errors so error pages are never parsed or saved
	# as packets; -sS hides the progress meter but still prints real errors.
	curl -fsS "$baseurl" | # get index page
		grep '<SPAN class="Name">' | # extract lines containing packet links
		sed -e 's/^.*href="//' -e 's/">/*/' -e 's/<\/a>.*$//' | # reduce to "url*name"
		while IFS= read -r line; do
			# skip index lines that did not reduce to the "url*name" form
			[[ $line == *\** ]] || continue

			# split on the first '*' using parameter expansion only, so the
			# scraped text is never word-split or glob-expanded
			seturl=${line%%\**}
			setname=${line#*\*}
			setname=${setname//\//,} # '/' is not allowed inside a directory name

			printf '%s\n' "Downloading packets from $setname"
			mkdir -p "$outdir/$setname" || continue

			# literal text that starts every packet link belonging to this set
			linkprefix="href=\"$baseurl/$seturl"

			curl -fsS "$baseurl/$seturl" | # get packet listing
				sed -e 's/<\/LI>/\n/g' | # put each packet entry on its own line
				grep -F -- "$linkprefix" | # lines with links to packets (literal match)
				while IFS= read -r line2; do
					# the packet URL runs from the matched prefix to the closing quote
					rest=${line2#*"$linkprefix"}
					rest=${rest%%\"*}
					packurl="$baseurl/$seturl$rest"
					filename=${packurl##*/}
					# a URL ending in '/' has no file name to save under
					[[ -n $filename ]] || continue
					curl -fsS "$packurl" -o "$outdir/$setname/$filename" ||
						printf 'failed to download %s\n' "$packurl" >&2
				done
		done