Last active
March 18, 2016 14:53
-
-
Save collegeman/0275f257bf0808f9b0bd to your computer and use it in GitHub Desktop.
Paste into Chrome console, crawl films.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
To use this script, open Chrome and browse to
http://www.clevelandfilm.org/schedule
Open the JavaScript console (CMD + ALT + J on Mac), and
paste the whole script into the command line.
The script is asynchronous, meaning that when you run it,
it will appear to finish immediately, but in fact the
script makes one request for each film-like thing it finds
on the schedule page. You can watch it run by clicking
on the Network tab of the debugger window (already open
if you did CMD + ALT + J above). When all of the
network requests have stopped, it's safe to dump the contents
of the global variable "films". You dump it by running
JSON.stringify(films). The output is a JSON-encoded
string that you can then use to generate a CSV. The PHP
script I use to transform this JSON output is here:
https://gist.github.com/collegeman/e243e774d70bb80f7b98
*/
// Crawl results: one object per film link found on the schedule page.
// Kept as a top-level `var` on purpose so that, after pasting the script
// into the console, it is reachable as the global `films` and can be
// dumped with JSON.stringify(films).
var films = [];
// Matches a single e-mail address anywhere in a string (case-insensitive).
var FILM_EMAIL_PATTERN = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i;

/**
 * Return the first e-mail address found in the given text.
 * @param {string} text Text, URL or HTML to search (may be empty/undefined)
 * @return {string} The address, or '' when none is present
 */
function firstEmailIn(text) {
  var found = text ? String(text).match(FILM_EMAIL_PATTERN) : null;
  return found ? found[0] : '';
}

/**
 * Turn the HTML of the "info" paragraph into a list of values.
 * The paragraph holds "Label: value" entries separated by line breaks;
 * one value is returned per entry, in page order, so positions stay stable.
 * @param {string} infoHtml Inner HTML of the info paragraph (may be undefined)
 * @return {string[]} The values ('' for an entry that has no colon)
 */
function parseFilmInfo(infoHtml) {
  if (!infoHtml) {
    return [];
  }
  // accept <br>, <br/> and <br /> as separators
  return infoHtml.split(/<br\s*\/?>/i).map(function (piece) {
    // wrap the fragment in a tag so jQuery can parse it and strip any markup
    var line = jQuery.trim(jQuery('<span>' + piece + '</span>').text());
    var colon = line.indexOf(':');
    // keep everything after the FIRST colon, so values that themselves
    // contain a colon (e.g. "1:30") are not cut short
    return colon < 0 ? '' : jQuery.trim(line.slice(colon + 1));
  });
}

/**
 * Keep an href only when it is an absolute http(s) URL.
 * Anything else (mailto links, links relative to the festival site,
 * a missing href) is not a film website.
 * @param {string} href The raw href attribute (may be undefined)
 * @return {?string} The URL, or null
 */
function externalUrlOrNull(href) {
  var url = jQuery.trim(href);
  return /^https?:\/\//i.test(url) ? url : null;
}

/**
 * Download, parse, and store details for the film
 * at the given index in films.
 *
 * The request is asynchronous; when it completes the film object gains:
 *   email   - contact address ('' when none could be found)
 *   info    - values from the info paragraph (year, run time, country, ...)
 *   website - the film's own website URL, or null
 * A failed download is reported on the console and leaves the film as is.
 *
 * @param {number} i The index into films
 */
function getFilmDetails(i) {
  var film = films[i];
  if (!film) {
    console.error('No film at index', i);
    return;
  }

  // do an async get request to the details page on cleveland's site
  jQuery.get(film.url, function (html) {
    // parse the HTML so that we can extract data from it
    var details = jQuery(html);

    // Look for the e-mail address in the right-hand column. Only the first
    // mailto link is used; its visible text is preferred, then its target
    // (the text may be a label such as "Contact"), and finally any
    // un-linked address appearing in the column's HTML.
    var mailto = details.find('#film-detail a[href^="mailto"]').first();
    film.email =
      firstEmailIn(mailto.text()) ||
      firstEmailIn(mailto.attr('href')) ||
      firstEmailIn(details.find('#film-detail').html());
    if (!film.email) {
      console.error("Couldn't identify e-mail", film.url);
    }

    // info elements: year released, run-time, country
    film.info = parseFilmInfo(details.find('p.info').html());

    // sometimes the website URL for the film is listed; if it is, it's
    // usually the last anchor tag in the column
    film.website = externalUrlOrNull(
      details.find('#film-detail a:last').attr('href')
    );
  }).fail(function (xhr, status) {
    // don't let a failed download pass unnoticed
    console.error("Couldn't download film details", film.url, status);
  });
}
// This is the bit that goes through the schedule page and finds all the
// films: every title link becomes an entry in `films`, and a details
// download is started for it right away.
jQuery('.film .title a').each(function () {
  // `this.href` is the link's fully-resolved URL, so relative,
  // root-relative and absolute hrefs all yield a valid address on the
  // same origin the schedule page was loaded from.
  var url = this.href;
  if (!url) {
    // an anchor without an href has no details page to fetch
    return;
  }
  // stash the title and the URL of the details page for later
  films.push({
    title: jQuery.trim(jQuery(this).text()),
    url: url
  });
  // then kick off the process that downloads the details and parses them
  getFilmDetails(films.length - 1);
});
// wait for crawler to finish, then run JSON.stringify(films)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
We discovered that sometimes e-mail addresses appear in the page but are not hyperlinked, so we modified the script.