Last active
May 29, 2020 12:32
-
-
Save markasoftware/12a0b08b2d68b90ca4b40bacae5d79b7 to your computer and use it in GitHub Desktop.
ACM Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# This file is released under the GNU Public License v3
# ACM scraper during coronavirus
# Will skip existing PDFs to speed up a resumed download
# Get the link to the first issue in the journal/SIG/etc, then the scraper will use "next" links to traverse
# Usage: ./acm.bash first_issue_link output_dir
# Eg, ./acm.bash https://dl.acm.org/toc/siggraph/1969/3/3 /media/mass/siggraph to download all SIGGRAPH PDFs
# note: acm does BLOCK IPs after a few hundred PDFs! Uncommenting the sleep statement below to slow things down may help, but I haven't tested
# --- Preflight checks -------------------------------------------------------

# pup is used to extract links from the issue pages; bail out early with an
# install hint if it is missing. (URL was mangled by the page scrape: the
# project lives at github.com/ericchiang/pup.)
if ! command -v pup >/dev/null
then
  echo 'Install PUP: https://github.com/ericchiang/pup/releases' >&2
  exit 1
fi

# Require both positional arguments: the first-issue URL and the output dir.
# Usage/diagnostics go to stderr, consistent with the rest of the script.
if (( $# < 2 ))
then
  echo 'Usage: ./acm.bash first_issue_link output_dir' >&2
  exit 1
fi
issue_link=$1
output=${2%/}   # strip a single trailing slash so paths can be appended safely

if ! [[ -d "$output" ]]
then
  echo "Output directory does not exist." >&2
  exit 1
fi

# Session cookies: the ACM landing page sets cookies that later requests need.
# Keep them in a throwaway jar and guarantee it is removed on ANY exit path
# (normal completion, early exit, or Ctrl-C) via the EXIT trap.
cookiejar=$(mktemp) || { echo 'mktemp failed' >&2; exit 1; }
trap 'rm -f "$cookiejar"' EXIT
curl -sLc "$cookiejar" "https://dl.acm.org" >/dev/null
# --- Main scrape loop -------------------------------------------------------
# Follow the "next issue" navigation link from issue to issue until the link
# no longer points at a table-of-contents ("toc") page.
while true
do
  [[ $issue_link = *'toc'* ]] || break
  issue_html=$(curl -sb "$cookiejar" "$issue_link")
  # e.g. https://dl.acm.org/toc/siggraph/1969/3/3 -> "1969/3/3"
  issue_slashes=${issue_link#*toc/*/}
  echo "$issue_link" >&2
  issue_dir="$output/$issue_slashes"
  mkdir -p "$issue_dir"
  # Each issue item renders as <a href="/doi/abs/DOI">Title</a>; flatten the
  # HTML to one line, then pull out each opening <a ...> tag on its own line.
  # printf '%s' instead of echo -n: safe for arbitrary HTML content.
  printf '%s' "$issue_html" | pup '.issue-item__title > a' | tr -d \\n | grep -o '<a[^<]*' | while IFS= read -r link
  do
    title=${link##*\"> }   # text after the closing quote of the href
    title=${title//\//_}   # a '/' in a title would create bogus subdirectories
    doi=${link#*\"}        # strip through the opening href quote
    doi=${doi%%\"*}        # keep everything up to the closing quote
    doi=${doi/abs/pdf}     # /doi/abs/... -> /doi/pdf/... (direct PDF link)
    echo " $title ($doi)" >&2
    if [[ -e "$issue_dir/$title.pdf" ]]
    then
      echo ' (skipping, already exists)' >&2
    else
      curl -sb "$cookiejar" -o "$issue_dir/$title.pdf" "https://dl.acm.org$doi"
      # uncomment to decrease the risk of ACM blocking your IP
      # sleep 30
    fi
  done
  # If there is no "next" button, grep emits nothing, issue_link becomes the
  # bare host URL, and the *toc* guard at the top of the loop terminates us.
  issue_link=$(printf '%s' "$issue_html" | pup '.content-navigation__btn--next' | grep -o 'href="[^"]*')
  issue_link=${issue_link#href=\"}
  issue_link="https://dl.acm.org$issue_link"
done
# All issues processed; report completion and drop the session cookie jar.
printf 'Done!\n' >&2
rm -f -- "$cookiejar"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment