motatoes · June 27, 2017 23:31
diff --git a/extractHOL.js b/extractHOL.js
 // == This script extracts emails and other data about the UK House of Lords from the following url:
 // http://www.parliament.uk/mps-lords-and-offices/lords/

 var casper = require('casper').create();
 var fs = require('fs');

 // Profile links collected from the index page; assigned from getLinks() in the first scraping step
 var links = [];

 // One record per visited profile; serialized to 'lords.json' at the end of this script
 var scrapedRows = [];

 // == Some helper functions == //


 // Looks up the first element matching `selector` and returns its `property`.
 // When nothing matches, `defaultValue` is returned instead.
 function querySelectorGet(selector, property, defaultValue) {
 	var element = document.querySelector(selector);
 	// No match: hand back the caller-supplied fallback
 	if (!element) {
 		return defaultValue;
 	}
 	return element[property];
 }

 // Collects the href of every anchor found inside a table on the current page.
 // Anchors without an href (getAttribute returns null) or with an empty href are
 // skipped, so the caller never receives an entry it cannot open.
 // Returns an array of href strings, in document order.
 function getLinks() {
 	var anchors = document.querySelectorAll('table a');
 	var hrefs = [];
 	for (var i = 0; i < anchors.length; i++) {
 		var href = anchors[i].getAttribute('href');
 		if (href) {
 			hrefs.push(href);
 		}
 	}
 	return hrefs;
 }

 // Reads one profile page and returns a plain object describing the member.
 // `querySelectorGet` is the lookup helper (selector, property, defaultValue);
 // it is received as an argument because this function is handed to evaluate().
 function scrapLordDetails(querySelectorGet) {
 	// Every text field is read the same way: innerHTML of the first match,
 	// 'N/A' when there is no match, surrounding whitespace removed.
 	function readField(selector) {
 		return querySelectorGet(selector, 'innerHTML', 'N/A').trim();
 	}

 	// First 'a' tag whose href starts with 'mailto'
 	var email = readField("a[href^='mailto']");

 	// The remaining fields are located through element ids
 	var title = readField("div#lords-fulltitle");
 	var name = readField("div#lords-name");
 	var party = readField("div#lords-party-group");
 	var dateJoined = readField("div#joined-lords");
 	var phone = readField("#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone");
 	var address1 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl00_pnlAddress');
 	var address2 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl01_pnlAddress');

 	// A profile may list any number of social media links; gather every href
 	var socialAnchors = document.querySelectorAll('#web-social-media a');
 	var socialHrefs = [];
 	for (var i = 0; i < socialAnchors.length; i++) {
 		socialHrefs.push(socialAnchors[i].getAttribute('href'));
 	}

 	// The variable-length link list is flattened into one ' ; '-separated string
 	var joinedSocial = socialHrefs.join(' ; ');

 	return {
 		name: name,
 		title: title,
 		email: email,
 		party: party,
 		dateJoined: dateJoined,
 		phone: phone,
 		address1: address1,
 		address2: address2,
 		socialMedia: joinedSocial
 	};
 }


 // ==\ Some helper functions \== //

 // == Let the scraping begin == //


 // Step 1: open the Lords index page and echo its title.
 casper.start('http://www.parliament.uk/mps-lords-and-offices/lords/', function() {
 	this.echo( 'Opened main site titled: ' + this.getTitle());
 });

 // Step 2: collect the profile links from the index page, then visit each one.
 casper.then( function() {
 	// aggregate all the links to the lord profiles
 	links = this.evaluate(getLinks);

 	this.echo('scraping links ...')
 	// For each link
     casper.eachThen(links, function(response) {
     	// response.data is used as the URL to open (presumably the current entry of links — see CasperJS eachThen)
     	casper.open(response.data).then(function() {
     		// We pass the querySelectorGet method to use it within the webpage context
     		var row = this.evaluate(scrapLordDetails, querySelectorGet);
     		scrapedRows.push(row);

     		// Stats display
     		this.echo('Scraped row ' + scrapedRows.length + ' of ' + links.length);
     	});
     });

 });

 // Step 3: serialize everything gathered in scrapedRows to 'lords.json'.
 casper.then(function() {
 	// We write the data as a JSON file, you can convert it to a csv using: http://konklone.io/json/
 	fs.write('lords.json', JSON.stringify(scrapedRows), 'w')
 });

 // Start the run; the callback exits casper.
 casper.run( function() {
 	casper.exit();
 });
diff --git a/ExtractParliament.js b/ExtractParliament.js
 // == This script extracts emails and other data about UK Members of Parliament (MPs) from the following url:
 // http://www.parliament.uk/mps-lords-and-offices/mps/

 var casper = require('casper').create();
 var fs = require('fs');

 // Profile links collected from the /mps/ index page; assigned from getLinks() in the first scraping step
 var links = [];

 // One record per visited profile; serialized to a JSON file at the end of this script
 var scrapedRows = [];

 // == Some helper functions == //


 // Returns `property` of the first element matching `selector`.
 // Falls back to `defaultValue` when no element matches.
 function querySelectorGet(selector, property, defaultValue) {
 	var match = document.querySelector(selector);
 	if (match) {
 		// Element found: read the requested property from it
 		return match[property];
 	}
 	return defaultValue;
 }

 // Collects the href of every anchor found inside a table on the current page.
 // Anchors whose href attribute is missing (getAttribute returns null) or empty
 // are left out, so every returned entry is a non-empty string.
 // Returns the hrefs in document order.
 function getLinks() {
 	var tableAnchors = document.querySelectorAll('table a');
 	var collected = [];
 	for (var idx = 0; idx < tableAnchors.length; idx++) {
 		var target = tableAnchors[idx].getAttribute('href');
 		if (target) {
 			collected.push(target);
 		}
 	}
 	return collected;
 }

 // Reads one MP profile page and returns a plain object describing the member.
 // `querySelectorGet` is the lookup helper (selector, property, defaultValue);
 // it is received as an argument because this function is handed to evaluate().
 // Every text field is the innerHTML of the first match, or 'N/A' when there is
 // no match, with surrounding whitespace removed. This now includes `phone`,
 // which was previously the only field returned untrimmed.
 function scrapLordDetails(querySelectorGet) {
 	// Single place that applies the lookup + trim, so no field can miss the trim
 	function readField(selector) {
 		return querySelectorGet(selector, 'innerHTML', 'N/A').trim();
 	}

 	// Get the first 'a' tag that has a 'mailto' href value
 	var email = readField("a[href^='mailto']");

 	// The rest of the information can be extracted via ID tags
 	var title = readField("div#commons-biography-header h1");
 	var name = readField("div#commons-addressas");
 	var party = readField("div#commons-party");
 	// NOTE(review): this id says "lords" while the other ids on this page say "commons" — confirm it exists on MP pages
 	var dateJoined = readField("div#joined-lords");
 	var phone = readField("#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone");
 	var constituency = readField("div#commons-constituency");
 	var address1 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlAddress');
 	// NOTE(review): this is the same telephone-panel selector used for `phone`, so address2 duplicates phone — confirm the intended address selector
 	var address2 = readField('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone');

 	// There are multiple social media links possibly ..
 	var socialMediaATags = document.querySelectorAll('#social-media a');
 	var socialMedia = Array.prototype.map.call(socialMediaATags, function(e) {
 		return e.getAttribute('href');
 	});

 	// Return all the information as a plain object
 	return {
 		name: name,
 		title: title,
 		email: email,
 		party: party,
 		dateJoined: dateJoined,
 		phone: phone,
 		constituency: constituency,
 		address1: address1,
 		address2: address2,
 		// There is no fixed number of social media so we join them with a semicolon
 		socialMedia: socialMedia.join(' ; ')
 	};
 }


 // ==\ Some helper functions \== //

 // == Let the scraping begin == //


 // Step 1: open the MPs index page and echo its title.
 casper.start('http://www.parliament.uk/mps-lords-and-offices/mps/', function() {
 	this.echo( 'Opened main site titled: ' + this.getTitle());
 });

 // Step 2: collect the profile links from the index page, then visit each one.
 casper.then( function() {
 	// aggregate all the links to the profiles listed on the /mps/ index page
 	links = this.evaluate(getLinks);
 	this.echo('scraping links ...')
 	// For each link
     casper.eachThen(links, function(response) {

     	// response.data is used as the URL to open (presumably the current entry of links — see CasperJS eachThen)
     	casper.open(response.data).then(function() {
     		// We pass the querySelectorGet method to use it within the webpage context
     		var row = this.evaluate(scrapLordDetails, querySelectorGet);
     		scrapedRows.push(row);

     		// Stats display
     		this.echo('Scraped row ' + scrapedRows.length + ' of ' + links.length);
     	});
     });

 });

 // Step 3: serialize everything gathered in scrapedRows to a JSON file.
 casper.then(function() {
 	// We write the data as a JSON file, you can convert it to a csv using: http://konklone.io/json/
 	// NOTE(review): this script starts from the /mps/ index yet writes to 'lords.json' — confirm the output file name is intended
 	fs.write('lords.json', JSON.stringify(scrapedRows), 'w')
 });

 // Start the run; the callback exits casper.
 casper.run( function() {
 	casper.exit();
 });
	// == This script extracts emails and other data about the UK House of Lords from the following url:
	// http://www.parliament.uk/mps-lords-and-offices/lords/

	var casper = require('casper').create();
	var fs = require('fs');

	// Profile links collected from the index page; assigned from getLinks() in the first scraping step
	var links = [];

	// One record per visited profile; serialized to 'lords.json' at the end of this script
	var scrapedRows = [];

	// == Some helper functions == //


	// Fetches `property` from the first element that matches `selector`.
	// If the page has no such element, the supplied `defaultValue` is returned.
	function querySelectorGet(selector, property, defaultValue) {
		var found = document.querySelector(selector);
		var result = defaultValue;
		if (found) {
			// Overwrite the fallback with the element's own property value
			result = found[property];
		}
		return result;
	}

	// Gathers the href of each anchor located inside a table on the current page.
	// Anchors with no href attribute (getAttribute returns null) or an empty one
	// are ignored, so only usable link strings are returned, in document order.
	function getLinks() {
		var anchors = document.querySelectorAll('table a');
		var hrefs = Array.prototype.map.call(anchors, function(anchor) {
			return anchor.getAttribute('href');
		});
		// Drop null / empty entries produced by anchors without a target
		return hrefs.filter(function(href) {
			return Boolean(href);
		});
	}

	// Builds the record for a single profile page.
	// `querySelectorGet` is the lookup helper (selector, property, defaultValue),
	// supplied as an argument because this function is handed to evaluate().
	function scrapLordDetails(querySelectorGet) {
		// Shared reader: innerHTML of the first match, 'N/A' when absent, trimmed
		var textOf = function(selector) {
			var raw = querySelectorGet(selector, 'innerHTML', 'N/A');
			return raw.trim();
		};

		// The e-mail address comes from the first anchor whose href starts with 'mailto'
		var email = textOf("a[href^='mailto']");

		// All other fields are addressed by element id
		var title = textOf("div#lords-fulltitle");
		var name = textOf("div#lords-name");
		var party = textOf("div#lords-party-group");
		var dateJoined = textOf("div#joined-lords");
		var phone = textOf("#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addParliamentaryAddress_rptAddresses_ctl00_pnlTelephone");
		var address1 = textOf('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl00_pnlAddress');
		var address2 = textOf('#ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_addPrivateOfficeAddress_rptAddresses_ctl01_pnlAddress');

		// Zero or more social media anchors may be present; keep each href
		var anchors = document.querySelectorAll('#web-social-media a');
		var hrefs = [];
		var position = 0;
		while (position < anchors.length) {
			hrefs.push(anchors[position].getAttribute('href'));
			position += 1;
		}

		var record = {
			name: name,
			title: title,
			email: email,
			party: party,
			dateJoined: dateJoined,
			phone: phone,
			address1: address1,
			address2: address2,
			// The link count varies, so the hrefs are joined with ' ; '
			socialMedia: hrefs.join(' ; ')
		};
		return record;
	}


	// ==\ Some helper functions \== //

	// == Let the scraping begin == //


	// Step 1: open the Lords index page and echo its title.
	casper.start('http://www.parliament.uk/mps-lords-and-offices/lords/', function() {
	this.echo( 'Opened main site titled: ' + this.getTitle());
	});

	// Step 2: collect the profile links from the index page, then visit each one.
	casper.then( function() {
	// aggregate all the links to the lord profiles
	links = this.evaluate(getLinks);

	this.echo('scraping links ...')
	// For each link
	casper.eachThen(links, function(response) {
	// response.data is used as the URL to open (presumably the current entry of links — see CasperJS eachThen)
	casper.open(response.data).then(function() {
	// We pass the querySelectorGet method to use it within the webpage context
	var row = this.evaluate(scrapLordDetails, querySelectorGet);
	scrapedRows.push(row);

	// Stats display
	this.echo('Scraped row ' + scrapedRows.length + ' of ' + links.length);
	});
	});

	});

	// Step 3: serialize everything gathered in scrapedRows to 'lords.json'.
	casper.then(function() {
	// We write the data as a JSON file, you can convert it to a csv using: http://konklone.io/json/
	fs.write('lords.json', JSON.stringify(scrapedRows), 'w')
	});

	// Start the run; the callback exits casper.
	casper.run( function() {
	casper.exit();
	});