Last active
May 29, 2020 12:32
-
-
Save markasoftware/12a0b08b2d68b90ca4b40bacae5d79b7 to your computer and use it in GitHub Desktop.
ACM Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# This file is released under the GNU Public License v3
# ACM scraper during coronavirus
# Will skip existing PDFs to speed up a resumed download
# Get the link to the first issue in the journal/SIG/etc, then the scraper will use "next" links to traverse
# Usage: ./acm.bash first_issue_link output_dir
# Eg, ./acm.bash https://dl.acm.org/toc/siggraph/1969/3/3 /media/mass/siggraph to download all SIGGRAPH PDFs
# note: acm does BLOCK IPs after a few hundred PDFs! Uncommenting the sleep statement below to slow things down may help, but I haven't tested
# --- Preflight checks -------------------------------------------------------

# pup is used to extract links from the issue pages; bail out early with an
# install hint if it is missing. (URL was mangled by the page scrape: the
# project lives at github.com/ericchiang/pup.)
if ! command -v pup >/dev/null
then
  echo 'Install PUP: https://github.com/ericchiang/pup/releases' >&2
  exit 1
fi

# Require both positional arguments: the first-issue URL and the output dir.
# Usage/diagnostics go to stderr, consistent with the rest of the script.
if (( $# < 2 ))
then
  echo 'Usage: ./acm.bash first_issue_link output_dir' >&2
  exit 1
fi
issue_link=$1
output=${2%/}   # strip a single trailing slash so paths can be appended safely

if ! [[ -d "$output" ]]
then
  echo "Output directory does not exist." >&2
  exit 1
fi

# Session cookies: the ACM landing page sets cookies that later requests need.
# Keep them in a throwaway jar and guarantee it is removed on ANY exit path
# (normal completion, early exit, or Ctrl-C) via the EXIT trap.
cookiejar=$(mktemp) || { echo 'mktemp failed' >&2; exit 1; }
trap 'rm -f "$cookiejar"' EXIT
curl -sLc "$cookiejar" "https://dl.acm.org" >/dev/null
# --- Main scrape loop -------------------------------------------------------
# Follow the "next issue" navigation link from issue to issue until the link
# no longer points at a table-of-contents ("toc") page.
while true
do
  [[ $issue_link = *'toc'* ]] || break
  issue_html=$(curl -sb "$cookiejar" "$issue_link")
  # e.g. https://dl.acm.org/toc/siggraph/1969/3/3 -> "1969/3/3"
  issue_slashes=${issue_link#*toc/*/}
  echo "$issue_link" >&2
  issue_dir="$output/$issue_slashes"
  mkdir -p "$issue_dir"
  # Each issue item renders as <a href="/doi/abs/DOI">Title</a>; flatten the
  # HTML to one line, then pull out each opening <a ...> tag on its own line.
  # printf '%s' instead of echo -n: safe for arbitrary HTML content.
  printf '%s' "$issue_html" | pup '.issue-item__title > a' | tr -d \\n | grep -o '<a[^<]*' | while IFS= read -r link
  do
    title=${link##*\"> }   # text after the closing quote of the href
    title=${title//\//_}   # a '/' in a title would create bogus subdirectories
    doi=${link#*\"}        # strip through the opening href quote
    doi=${doi%%\"*}        # keep everything up to the closing quote
    doi=${doi/abs/pdf}     # /doi/abs/... -> /doi/pdf/... (direct PDF link)
    echo " $title ($doi)" >&2
    if [[ -e "$issue_dir/$title.pdf" ]]
    then
      echo ' (skipping, already exists)' >&2
    else
      curl -sb "$cookiejar" -o "$issue_dir/$title.pdf" "https://dl.acm.org$doi"
      # uncomment to decrease the risk of ACM blocking your IP
      # sleep 30
    fi
  done
  # If there is no "next" button, grep emits nothing, issue_link becomes the
  # bare host URL, and the *toc* guard at the top of the loop terminates us.
  issue_link=$(printf '%s' "$issue_html" | pup '.content-navigation__btn--next' | grep -o 'href="[^"]*')
  issue_link=${issue_link#href=\"}
  issue_link="https://dl.acm.org$issue_link"
done
# All issues processed; report completion and drop the session cookie jar.
printf 'Done!\n' >&2
rm -f -- "$cookiejar"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment