collegeman · March 18, 2016 14:53 · collegeman · Mar 17, 2016
diff --git a/cleveland-international-film-festival-crawler-2016.js b/cleveland-international-film-festival-crawler-2016.js
 /*
 To use this script, open Chrome and browse to
 http://www.clevelandfilm.org/schedule
 Open the JavaScript console (CMD + ALT + J on Mac), and
 paste the whole script into the command line.
 The script is asynchronous, meaning that when you run it,
 it will appear to finish immediately, but in fact the
 script makes one request for each film-like thing it finds
 on the schedule page. You can see it running by clicking
 on the Network tab of the debugger window (already open
 if you did CMD + ALT + J above). When all of the
 network requests stop, it's safe to dump the contents
 of the global variable "films". You dump by running
 JSON.stringify(films). The output will be JSON encoded—
 a string that you can then use to generate a CSV. The PHP 
 script I use to transform this JSON file is here:
 https://gist.github.com/collegeman/e243e774d70bb80f7b98
 */
 // this array will store the films, one object per film.
 // Each entry starts out as {title, url} (pushed by the schedule
 // loop below) and later gains email, info, and website once
 // getFilmDetails() has downloaded and parsed its details page.
 // It is a top-level var so that it can be dumped from the
 // console with JSON.stringify(films) after the crawl finishes.
 var films = []; 
 /**
  * Download, parse, and store details for the film
  * at the given index in films.
  *
  * The request is asynchronous: this function returns immediately and
  * the film object is filled in when the response arrives. On success
  * the following properties are set on the film:
  *   - email:   the first e-mail address found in #film-detail, or ''
  *   - info:    one trimmed value per <br>-separated line of p.info
  *              (the text after the first colon of each line; '' when
  *              a line has no colon, so positions stay stable)
  *   - website: the href of the last anchor in #film-detail when it is
  *              an absolute http(s) URL, otherwise null
  * A failed download is reported with console.error and leaves the
  * film untouched.
  *
  * @param {number} i The index of the film in the global films array
  */
 function getFilmDetails(i) {
 	// only absolute http:// or https:// links count as a film website
 	var absoluteUrl = /^https?:\/\//i;
 	// get the film at i
 	var film = films[i];
 	// do an async get request to the details page on cleveland's site
 	jQuery.get(film.url, function(html) {
 		// parse the HTML so that we can extract data from it
 		var details = jQuery(html);
 		// try to get the e-mail address from a mailto link; take only the
 		// first link, because .text() on several links would glue their
 		// texts together into one unusable string
 		film.email = jQuery.trim(details.find('#film-detail a[href^="mailto"]').first().text());
 		// couldn't find it? we'll look harder
 		if (!film.email) {
 			// isolate that column (normally it exists)
 			var search = details.find('#film-detail').html();
 			// use a regular expression to seek out the first e-mail address
 			var found = search ? search.match(/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i) : null;
 			if (found) {
 				film.email = found[0];
 			} else {
 				// no e-mail address (or no column at all)? report it in the log
 				console.error("Couldn't identify e-mail", film.url);
 			}
 		}
 		// next look for the info elements (per the original author:
 		// year released, run-time, country)
 		film.info = [];
 		var info = details.find('p.info').html();
 		if (info) {
 			// these elements are in a block of text, separated by <br> (linebreak) elements
 			film.info = info.split('<br>').map(function(fragment) {
 				// we have to wrap each section in an HTML tag so that we can parse it
 				var line = jQuery.trim(jQuery('<span>' + fragment + '</span>').text());
 				// keep everything after the FIRST colon: the label is dropped, but a
 				// value that itself contains a colon (e.g. "1:45") stays intact
 				var colon = line.indexOf(':');
 				return colon < 0 ? '' : jQuery.trim(line.slice(colon + 1));
 			});
 		}

 		// sometimes the website URL for the film is listed
 		// if it is, it's usually the last anchor tag in the column
 		var href = jQuery.trim(details.find('#film-detail a:last').attr('href') || '');
 		// anything that is not an absolute http(s) URL (a mailto link, a
 		// relative link to a page on cleveland's site, no link at all) is
 		// not a film website, and we don't care about it
 		film.website = absoluteUrl.test(href) ? href : null;

 		// update the data we have on the film
 		films[i] = film;
 	}).fail(function(xhr, status) {
 		// without this, a film whose page failed to load would silently
 		// end up in the output with no details at all
 		console.error("Couldn't download details", film.url, status);
 	});
 }

 // Walk the schedule page: every title link inside a
 // .film element is treated as one film to crawl.
 jQuery('.film .title a').each(function(position, anchor) {
 	var link = jQuery(anchor);
 	// stash the title and the details-page URL for later;
 	// push() hands back the new length of the array
 	var count = films.push({
 		title: link.text(),
 		url: 'http://www.clevelandfilm.org/' + link.attr('href')
 	});
 	// the film we just added sits in the last slot, so start
 	// downloading and parsing its details from there
 	getFilmDetails(count - 1);
 });
 // wait for crawler to finish, then run JSON.stringify(films)
	/*
	To use this script, open Chrome and browse to
	http://www.clevelandfilm.org/schedule
	Open the JavaScript console (CMD + ALT + J on Mac), and
	paste the whole script into the command line.
	The script is asynchronous, meaning that when you run it,
	it will appear to finish immediately, but in fact the
	script makes one request for each film-like thing it finds
	on the schedule page. You can see it running by clicking
	on the Network tab of the debugger window (already open
	if you did CMD + ALT + J above). When all of the
	network requests stop, it's safe to dump the contents
	of the global variable "films". You dump by running
	JSON.stringify(films). The output will be JSON encoded—
	a string that you can then use to generate a CSV. The PHP
	script I use to transform this JSON file is here:
	https://gist.github.com/collegeman/e243e774d70bb80f7b98
	*/
	// this array will store the films, one object per film.
	// Each entry starts out as {title, url} (pushed by the schedule
	// loop below) and later gains email, info, and website once
	// getFilmDetails() has downloaded and parsed its details page.
	// It is a top-level var so that it can be dumped from the
	// console with JSON.stringify(films) after the crawl finishes.
	var films = [];
	/**
	 * Download, parse, and store details for the film
	 * at the given index in films.
	 *
	 * The request is asynchronous: this function returns immediately and
	 * the film object is filled in when the response arrives. On success
	 * the following properties are set on the film:
	 *   - email:   the first e-mail address found in #film-detail, or ''
	 *   - info:    one trimmed value per <br>-separated line of p.info
	 *              (the text after the first colon of each line; '' when
	 *              a line has no colon, so positions stay stable)
	 *   - website: the href of the last anchor in #film-detail when it is
	 *              an absolute http(s) URL, otherwise null
	 * A failed download is reported with console.error and leaves the
	 * film untouched.
	 *
	 * @param {number} i The index of the film in the global films array
	 */
	function getFilmDetails(i) {
		// only absolute http:// or https:// links count as a film website
		var absoluteUrl = /^https?:\/\//i;
		// get the film at i
		var film = films[i];
		// do an async get request to the details page on cleveland's site
		jQuery.get(film.url, function(html) {
			// parse the HTML so that we can extract data from it
			var details = jQuery(html);
			// try to get the e-mail address from a mailto link; take only the
			// first link, because .text() on several links would glue their
			// texts together into one unusable string
			film.email = jQuery.trim(details.find('#film-detail a[href^="mailto"]').first().text());
			// couldn't find it? we'll look harder
			if (!film.email) {
				// isolate that column (normally it exists)
				var search = details.find('#film-detail').html();
				// use a regular expression to seek out the first e-mail address
				var found = search ? search.match(/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i) : null;
				if (found) {
					film.email = found[0];
				} else {
					// no e-mail address (or no column at all)? report it in the log
					console.error("Couldn't identify e-mail", film.url);
				}
			}
			// next look for the info elements (per the original author:
			// year released, run-time, country)
			film.info = [];
			var info = details.find('p.info').html();
			if (info) {
				// these elements are in a block of text, separated by <br> (linebreak) elements
				film.info = info.split('<br>').map(function(fragment) {
					// we have to wrap each section in an HTML tag so that we can parse it
					var line = jQuery.trim(jQuery('<span>' + fragment + '</span>').text());
					// keep everything after the FIRST colon: the label is dropped, but a
					// value that itself contains a colon (e.g. "1:45") stays intact
					var colon = line.indexOf(':');
					return colon < 0 ? '' : jQuery.trim(line.slice(colon + 1));
				});
			}

			// sometimes the website URL for the film is listed
			// if it is, it's usually the last anchor tag in the column
			var href = jQuery.trim(details.find('#film-detail a:last').attr('href') || '');
			// anything that is not an absolute http(s) URL (a mailto link, a
			// relative link to a page on cleveland's site, no link at all) is
			// not a film website, and we don't care about it
			film.website = absoluteUrl.test(href) ? href : null;

			// update the data we have on the film
			films[i] = film;
		}).fail(function(xhr, status) {
			// without this, a film whose page failed to load would silently
			// end up in the output with no details at all
			console.error("Couldn't download details", film.url, status);
		});
	}

	// Walk the schedule page: every title link inside a
	// .film element is treated as one film to crawl.
	jQuery('.film .title a').each(function(position, anchor) {
		var link = jQuery(anchor);
		// stash the title and the details-page URL for later;
		// push() hands back the new length of the array
		var count = films.push({
			title: link.text(),
			url: 'http://www.clevelandfilm.org/' + link.attr('href')
		});
		// the film we just added sits in the last slot, so start
		// downloading and parsing its details from there
		getFilmDetails(count - 1);
	});
	// wait for crawler to finish, then run JSON.stringify(films)