Last active
June 27, 2017 23:31
-
-
Save motatoes/792765736826cd3466a0 to your computer and use it in GitHub Desktop.
House of lords info scraping (casperJS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// == This script extracts emails and other data about members of the UK House of Lords from the following URL:
// http://www.parliament.uk/mps-lords-and-offices/lords/
// CasperJS instance that drives the whole scrape, plus the file-system module
// used to write the results at the end.
var casper = require('casper').create();
var fs = require('fs');

// Links to each House of Lords profile page, filled in from the index page.
var links = [];

// One object per scraped profile; written to disk at the end of the script.
var scrapedRows = [];
// == Some helper functions == // | |
/**
 * Reads one property from the first element matching a CSS selector.
 *
 * @param {string} selector - CSS selector handed to document.querySelector.
 * @param {string} property - Name of the property to read from the match.
 * @param {*} defaultValue - Returned when nothing matches, or when the matched
 *   element has no value for `property`.
 * @returns {*} The property value, or `defaultValue`.
 */
function querySelectorGet(selector, property, defaultValue) {
    var element = document.querySelector(selector);
    if (!element) {
        return defaultValue;
    }
    var value = element[property];
    // A match that lacks the property would otherwise hand `undefined` to
    // callers that immediately call .trim() on the result.
    if (value === undefined || value === null) {
        return defaultValue;
    }
    return value;
}
/**
 * Collects the href attribute of every anchor found inside a table on the
 * current page.
 *
 * Anchors that carry no href (or an empty one) are skipped so that the caller
 * never receives an unusable entry.
 *
 * @returns {string[]} The href attribute values, in document order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('table a');
    var hrefs = Array.prototype.map.call(anchors, function(anchor) {
        return anchor.getAttribute('href');
    });
    return hrefs.filter(function(href) {
        return !!href;
    });
}
/**
 * Extracts the details of one House of Lords member from the current profile
 * page.
 *
 * @param {function(string, string, *): *} querySelectorGet - Lookup helper
 *   called as (selector, property, defaultValue); received as an argument
 *   rather than read from an outer scope.
 * @returns {Object} Plain object with the keys name, title, email, party,
 *   dateJoined, phone, address1, address2 and socialMedia. Fields that cannot
 *   be found hold 'N/A'; socialMedia is the found hrefs joined with ' ; '.
 */
function scrapLordDetails(querySelectorGet) {
    // Reads the innerHTML of the first match and trims it, defaulting to 'N/A'.
    // Kept local so this function does not rely on any other outer name.
    function readField(selector) {
        var raw = querySelectorGet(selector, 'innerHTML', 'N/A');
        if (raw === undefined || raw === null) {
            return 'N/A';
        }
        return String(raw).trim();
    }

    // Get the first 'a' tag that has a 'mailto' href value
    var email = readField("a[href^='mailto']");

    // The rest of the information can be extracted via ID tags
    var title = readField("div#lords-fulltitle");
    var name = readField("div#lords-name");
    var party = readField("div#lords-party-group");
    var dateJoined = readField("div#joined-lords");
    var phone = readField("#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone");
    var address1 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl00_pnlAddress');
    var address2 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl01_pnlAddress');

    // There may be several social media links; anchors without an href are
    // dropped so they do not leave empty slots in the joined string.
    var socialMediaATags = document.querySelectorAll('#web-social-media a');
    var socialMedia = Array.prototype.map.call(socialMediaATags, function(anchor) {
        return anchor.getAttribute('href');
    }).filter(function(href) {
        return !!href;
    });

    // Return everything as a plain object
    return {
        name: name,
        title: title,
        email: email,
        party: party,
        dateJoined: dateJoined,
        phone: phone,
        address1: address1,
        address2: address2,
        // There is no fixed number of social media links, so they are joined with a semicolon
        socialMedia: socialMedia.join(' ; ')
    };
}
// ==\ Some helper functions \== // | |
// == Let the scraping begin == // | |
// Open the index page that lists the members.
casper.start('http://www.parliament.uk/mps-lords-and-offices/lords/', function() {
    this.echo( 'Opened main site titled: ' + this.getTitle());
});

// Harvest the profile links, then queue one scraping step per link.
casper.then(function() {
    // evaluate() may come back empty (presumably when the page-side function
    // fails), so fall back to an empty list rather than crash in eachThen.
    links = this.evaluate(getLinks) || [];
    this.echo('scraping links ...');

    // For each link
    casper.eachThen(links, function(response) {
        casper.open(response.data).then(function() {
            // We pass the querySelectorGet function so it can be used within the webpage context
            var row = this.evaluate(scrapLordDetails, querySelectorGet);
            if (row) {
                scrapedRows.push(row);
            } else {
                // Keep empty results out of the output file.
                this.echo('No data scraped from ' + response.data);
            }
            // Stats display
            this.echo('Scraped row ' + scrapedRows.length + ' of ' + links.length);
        });
    });
});

// Runs after every queued profile step has finished.
casper.then(function() {
    // We write the data as a JSON file, you can convert it to a csv using: http://konklone.io/json/
    fs.write('lords.json', JSON.stringify(scrapedRows), 'w');
});

casper.run(function() {
    casper.exit();
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// == This script extracts emails and other data about UK Members of Parliament (House of Commons) from the following URL:
// http://www.parliament.uk/mps-lords-and-offices/mps/
// CasperJS instance that drives the whole scrape, plus the file-system module
// used to write the results at the end.
var casper = require('casper').create();
var fs = require('fs');

// Links to each member profile page, filled in from the index page.
var links = [];

// One object per scraped profile; written to disk at the end of the script.
var scrapedRows = [];
// == Some helper functions == // | |
/**
 * Reads one property from the first element matching a CSS selector.
 *
 * @param {string} selector - CSS selector handed to document.querySelector.
 * @param {string} property - Name of the property to read from the match.
 * @param {*} defaultValue - Returned when nothing matches, or when the matched
 *   element has no value for `property`.
 * @returns {*} The property value, or `defaultValue`.
 */
function querySelectorGet(selector, property, defaultValue) {
    var element = document.querySelector(selector);
    if (!element) {
        return defaultValue;
    }
    var value = element[property];
    // A match that lacks the property would otherwise hand `undefined` to
    // callers that immediately call .trim() on the result.
    if (value === undefined || value === null) {
        return defaultValue;
    }
    return value;
}
/**
 * Collects the href attribute of every anchor found inside a table on the
 * current page.
 *
 * Anchors that carry no href (or an empty one) are skipped so that the caller
 * never receives an unusable entry.
 *
 * @returns {string[]} The href attribute values, in document order.
 */
function getLinks() {
    var anchors = document.querySelectorAll('table a');
    var hrefs = Array.prototype.map.call(anchors, function(anchor) {
        return anchor.getAttribute('href');
    });
    return hrefs.filter(function(href) {
        return !!href;
    });
}
/**
 * Extracts the details of one member from the current profile page (this
 * variant reads the "commons-*" elements).
 *
 * @param {function(string, string, *): *} querySelectorGet - Lookup helper
 *   called as (selector, property, defaultValue); received as an argument
 *   rather than read from an outer scope.
 * @returns {Object} Plain object with the keys name, title, email, party,
 *   dateJoined, phone, constituency, address1, address2 and socialMedia.
 *   Fields that cannot be found hold 'N/A'; socialMedia is the found hrefs
 *   joined with ' ; '.
 */
function scrapLordDetails(querySelectorGet) {
    // Reads the innerHTML of the first match and trims it, defaulting to 'N/A'.
    // Kept local so this function does not rely on any other outer name.
    function readField(selector) {
        var raw = querySelectorGet(selector, 'innerHTML', 'N/A');
        if (raw === undefined || raw === null) {
            return 'N/A';
        }
        return String(raw).trim();
    }

    // Get the first 'a' tag that has a 'mailto' href value
    var email = readField("a[href^='mailto']");

    // The rest of the information can be extracted via ID tags
    var title = readField("div#commons-biography-header h1");
    var name = readField("div#commons-addressas");
    var party = readField("div#commons-party");
    // NOTE(review): this id says "lords" while every other id here says
    // "commons" — confirm that it exists on these pages.
    var dateJoined = readField("div#joined-lords");
    // Trimmed like every other field (it used to be the only untrimmed one).
    var phone = readField("#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone");
    var constituency = readField("div#commons-constituency");
    var address1 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlAddress');
    // NOTE(review): this is the same telephone panel selector used for `phone`,
    // so address2 duplicates the phone value — confirm the intended element.
    var address2 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone');

    // There may be several social media links; anchors without an href are
    // dropped so they do not leave empty slots in the joined string.
    var socialMediaATags = document.querySelectorAll('#social-media a');
    var socialMedia = Array.prototype.map.call(socialMediaATags, function(anchor) {
        return anchor.getAttribute('href');
    }).filter(function(href) {
        return !!href;
    });

    // Return everything as a plain object
    return {
        name: name,
        title: title,
        email: email,
        party: party,
        dateJoined: dateJoined,
        phone: phone,
        constituency: constituency,
        address1: address1,
        address2: address2,
        // There is no fixed number of social media links, so they are joined with a semicolon
        socialMedia: socialMedia.join(' ; ')
    };
}
// ==\ Some helper functions \== // | |
// == Let the scraping begin == // | |
// Open the index page that lists the members.
casper.start('http://www.parliament.uk/mps-lords-and-offices/mps/', function() {
    this.echo( 'Opened main site titled: ' + this.getTitle());
});

// Harvest the profile links, then queue one scraping step per link.
casper.then(function() {
    // evaluate() may come back empty (presumably when the page-side function
    // fails), so fall back to an empty list rather than crash in eachThen.
    links = this.evaluate(getLinks) || [];
    this.echo('scraping links ...');

    // For each link
    casper.eachThen(links, function(response) {
        casper.open(response.data).then(function() {
            // We pass the querySelectorGet function so it can be used within the webpage context
            var row = this.evaluate(scrapLordDetails, querySelectorGet);
            if (row) {
                scrapedRows.push(row);
            } else {
                // Keep empty results out of the output file.
                this.echo('No data scraped from ' + response.data);
            }
            // Stats display
            this.echo('Scraped row ' + scrapedRows.length + ' of ' + links.length);
        });
    });
});

// Runs after every queued profile step has finished.
casper.then(function() {
    // We write the data as a JSON file, you can convert it to a csv using: http://konklone.io/json/
    // NOTE(review): the file is named 'lords.json' although this script starts
    // from the /mps/ index — confirm whether a different name was intended.
    fs.write('lords.json', JSON.stringify(scrapedRows), 'w');
});

casper.run(function() {
    casper.exit();
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment