markandrus · December 13, 2015 19:48
diff --git a/drupal-scrape-1.sh b/drupal-scrape-1.sh
 #!/bin/sh
 #
 # Logs in to a Drupal site, downloads a directory table page, and saves
 # every page linked from the first column of that table.
 #
 # Requires:
 #
 #  * curl 7.24
 #  * tidy 15.10
 #  * xpath 5.12
 #
 # (Plain comments are used instead of a bare here-document so that the
 # notes can never swallow the code that follows if the terminator line
 # picks up indentation.)

 # Configuration: the Drupal endpoints first, then the local cookie jar and
 # output locations derived from $dir.
 drupal_url="https://minorityandwomenbusiness.sites.uchicago.edu"
 login_url="${drupal_url}/user"
 table_url="${drupal_url}/page/business-diversity-referral-services"

 jar="cookie.txt"
 dir="out"
 pages="${dir}/pages"
 table_html="${dir}/table.html"

 # Reset: discard the previous cookie jar and output tree, then recreate
 # the output directories.
 # ${var:?} aborts with a message instead of handing rm an empty operand.
 rm -rf -- "${jar:?}" "${dir:?}"
 # Every later step writes into these directories, so stop if they cannot
 # be created.
 mkdir -- "$dir" "$pages" || exit 1

 # Read username and password (no echo).
 # `read -p` is not POSIX, so the prompts are printed with printf (to
 # stderr, keeping them out of any redirected stdout).
 printf 'Username: ' >&2
 read -r name
 # Make sure terminal echo comes back if the prompt is interrupted.
 trap 'stty echo; exit 1' INT TERM HUP
 stty -echo
 printf 'Password: ' >&2
 # IFS= and -r keep the password byte-for-byte: no whitespace trimming and
 # no backslash processing.
 IFS= read -r pass
 stty echo
 trap - INT TERM HUP
 # The user's Enter key was not echoed; finish the prompt line ourselves.
 echo

 # Log in to Drupal, saving cookies.
 echo "Logging in to \`$login_url'."
 # --form-string sends each value literally. With -F, a username or
 # password starting with '@' or '<' makes curl read a local file, and a
 # ';type=' substring is parsed as a content-type directive.
 # NOTE: the password is still visible in the process list while curl runs.
 if ! curl "$login_url"                \
   -s                                  \
   -c "$jar"                           \
   -b "$jar"                           \
   --form-string "name=$name"          \
   --form-string "pass=$pass"          \
   --form-string 'form_id=user_login'  \
   --form-string 'op=Log in'           \
   --output /dev/null
 then
   # Without a session the remaining downloads would be fetched anonymously.
   echo "Login request to \`$login_url' failed." >&2
   exit 1
 fi

 # Fetch the directory page with the session cookies and pipe it through
 # tidy, which writes XHTML without a doctype to $table_html. tidy's
 # diagnostics are discarded.
 echo "Downloading \`$table_url'."
 curl --silent --cookie-jar "$jar" --cookie "$jar" "$table_url" |
   tidy -b -asxhtml --doctype omit >"$table_html" 2>/dev/null

 # Extract the URLs.
 echo 'Extracting URLs from table.'

 # Filter raw xpath output (stdin) down to one href value per line: skip
 # the first line (presumably xpath's result banner), strip trailing
 # "-- NODE --" markers, and unwrap the ` href="..."` attribute syntax.
 clean_xpath_hrefs() {
   # `tail +2` is obsolete syntax that a modern tail may read as a file
   # named "+2"; `-n +2` is the POSIX spelling for "start at line 2".
   tail -n +2 |
     sed -e 's/-- NODE --$//g' \
         -e 's/ href="\(.*\)"/\1/'
 }

 # stderr is merged into stdout on purpose, as before: the filter expects
 # xpath's markers and results in a single stream.
 urls=$(xpath "$table_html" '//tbody/tr/td[1]/a/@href' 2>&1 |
   clean_xpath_hrefs)

 # Download files: fetch every extracted link into $pages, naming each
 # file after the last path component of its link.
 # Disable globbing so that '?', '*' or '[' inside a link cannot be
 # expanded against local files when $urls is split into words.
 set -f
 for u in $urls; do
   echo "Downloading \`$u'."
   f=$(basename -- "$u").html
   curl "$drupal_url$u"         \
     -s                         \
     -c "$jar"                  \
     -b "$jar"                  \
     --output "$pages/$f" ||
     echo "Failed to download \`$u'." >&2   # report, then keep going
 done
 set +f
	#!/bin/sh
	#
	# Logs in to a Drupal site, downloads a directory table page, and saves
	# every page linked from the first column of that table.
	#
	# Requires:
	#
	# * curl 7.24
	# * tidy 15.10
	# * xpath 5.12
	#
	# (Plain comments replace the bare here-document: its indented
	# terminator line never matched, so it swallowed the code below.)

	# Settings: local artefacts (cookie jar, output tree) followed by the
	# remote Drupal endpoints built from the site's base URL.
	jar="cookie.txt"
	dir="out"
	pages="${dir}/pages"
	table_html="${dir}/table.html"

	drupal_url="https://minorityandwomenbusiness.sites.uchicago.edu"
	table_url="${drupal_url}/page/business-diversity-referral-services"
	login_url="${drupal_url}/user"

	# Reset: remove the old cookie jar and output tree, then recreate the
	# output directories.
	# ${var:?} aborts with a message instead of handing rm an empty operand.
	rm -rf -- "${jar:?}" "${dir:?}"
	# Later steps write into these directories; stop if creation fails.
	mkdir -- "$dir" "$pages" || exit 1

	# Read username and password (no echo).
	# `read -p` is not POSIX, so the prompts are printed with printf (to
	# stderr, keeping them out of any redirected stdout).
	printf 'Username: ' >&2
	read -r name
	# Make sure terminal echo comes back if the prompt is interrupted.
	trap 'stty echo; exit 1' INT TERM HUP
	stty -echo
	printf 'Password: ' >&2
	# IFS= and -r keep the password byte-for-byte: no whitespace trimming
	# and no backslash processing.
	IFS= read -r pass
	stty echo
	trap - INT TERM HUP
	# The user's Enter key was not echoed; finish the prompt line ourselves.
	echo

	# Log in to Drupal, saving cookies.
	echo "Logging in to \`$login_url'."
	# --form-string sends each value literally. With -F, a username or
	# password starting with '@' or '<' makes curl read a local file, and
	# a ';type=' substring is parsed as a content-type directive.
	# NOTE: the password is still visible in the process list while curl runs.
	if ! curl "$login_url" \
	  -s \
	  -c "$jar" \
	  -b "$jar" \
	  --form-string "name=$name" \
	  --form-string "pass=$pass" \
	  --form-string 'form_id=user_login' \
	  --form-string 'op=Log in' \
	  --output /dev/null
	then
	  # Without a session the remaining downloads would be fetched anonymously.
	  echo "Login request to \`$login_url' failed." >&2
	  exit 1
	fi

	# Download the directory: fetch the page with the session cookies and
	# pipe it through tidy, which writes XHTML without a doctype to
	# $table_html. tidy's diagnostics are discarded.
	echo "Downloading \`$table_url'."
	# The pipe must be a real `|`: the escaped `\|` handed a literal "|" and
	# all of tidy's arguments to curl, so tidy never ran.
	curl "$table_url" \
	  -s \
	  -c "$jar" \
	  -b "$jar" \
	  | tidy \
	      -b \
	      -asxhtml \
	      --doctype omit \
	      2>/dev/null \
	      >"$table_html"

	# Extract the URLs.
	echo 'Extracting URLs from table.'

	# Filter raw xpath output (stdin) down to one href value per line: skip
	# the first line (presumably xpath's result banner), strip trailing
	# "-- NODE --" markers, and unwrap the ` href="..."` attribute syntax.
	clean_xpath_hrefs() {
	  # `tail +2` is obsolete syntax that a modern tail may read as a file
	  # named "+2"; `-n +2` is the POSIX spelling for "start at line 2".
	  tail -n +2 |
	    sed -e 's/-- NODE --$//g' \
	        -e 's/ href="\(.*\)"/\1/'
	}

	# Real pipes replace the escaped `\|`, which passed a literal "|" to
	# xpath as an argument. stderr is merged into stdout on purpose, as
	# before: the filter expects markers and results in a single stream.
	urls=$(xpath "$table_html" '//tbody/tr/td[1]/a/@href' 2>&1 |
	  clean_xpath_hrefs)

	# Download files: fetch every extracted link into $pages, naming each
	# file after the last path component of its link.
	# Disable globbing so that '?', '*' or '[' inside a link cannot be
	# expanded against local files when $urls is split into words.
	set -f
	for u in $urls; do
	  echo "Downloading \`$u'."
	  f=$(basename -- "$u").html
	  curl "$drupal_url$u" \
	    -s \
	    -c "$jar" \
	    -b "$jar" \
	    --output "$pages/$f" ||
	    echo "Failed to download \`$u'." >&2   # report, then keep going
	done
	set +f
No results found