Created
November 6, 2017 13:51
-
-
Save Macxim/b3cf74d8fced60d8f26a09a4f6b37213 to your computer and use it in GitHub Desktop.
Web scraping paginated web page with Nightmare.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Scrape data from https://london.wtm.com/en/exhibitor-directory-2017/2017-Products
 *
 * Usage: $ node index.js
 */
// Dependencies: Nightmare drives the browser, vo runs the `run` generator,
// fs writes the CSV file.
const Nightmare = require('nightmare');
const vo = require('vo');
const fs = require('fs');

// Script entry point: execute the `run` generator through vo. Any error it
// reports is rethrown so the process fails loudly instead of exiting silently.
vo(run)(function (err, result) {
  if (err) throw err;
});
/**
 * Recursively flattens arbitrarily nested arrays into a single flat array,
 * preserving element order.
 *
 * Example: [ [ a ], [ b ], [ c ] ] => [ a, b, c ]
 *
 * @param {Array} arr - Array whose elements may themselves be arrays.
 * @returns {Array} A new array containing every non-array element of `arr`.
 */
function flatten(arr) {
  const flat = [];

  // Push into one shared accumulator instead of re-copying the partial result
  // with concat() every time a nested array is encountered.
  const collect = (items) => {
    for (let i = 0; i < items.length; i++) {
      if (Array.isArray(items[i])) {
        collect(items[i]);
      } else {
        flat.push(items[i]);
      }
    }
  };

  collect(arr);
  return flat;
}
/**
 * Writes the scraped rows to ./missing-stuff.csv, one CSV line per row,
 * preceded by a fixed header line. Lines are separated by CRLF.
 *
 * Every cell is passed through enquote() so that commas, double quotes or
 * line breaks inside a scraped value cannot shift or split columns.
 *
 * @param {Array} linksArr - Rows to write. Each row is an array of cells; a
 *   cell may be a plain value or a single-element array wrapping the value
 *   (both stringify to the same text). A non-array row is written as a
 *   one-cell line.
 * @throws {Error} Whatever fs.writeFileSync throws when the file cannot be
 *   written. The write is synchronous, so there is no callback to report to.
 */
function outputToCsv(linksArr) {
  const header = 'URL, Company, Website, Facebook, Phone';
  const lines = [header];

  for (const row of linksArr) {
    const cells = Array.isArray(row) ? row : [row];
    lines.push(cells.map((cell) => enquote(cell)).join(','));
  }

  fs.writeFileSync('./missing-stuff.csv', lines.join('\r\n'));
}
/**
 * Formats a value as a double-quoted CSV field. Embedded double quotes are
 * escaped by doubling them.
 *
 * @param {*} [val] - Value to quote; it is converted to a string first.
 * @returns {string} The quoted field, or '""' when `val` is omitted, null or
 *   undefined.
 */
function enquote(val) {
  // `== null` deliberately matches both null and undefined (including the
  // case where the function is called without arguments).
  if (val == null) {
    return '""';
  }
  return `"${String(val).replace(/"/g, '""')}"`;
}
/**
 * Page-context helper for nightmare.evaluate(): collects the absolute URL of
 * every exhibitor listed on the results page that is currently loaded.
 *
 * It is executed inside the browser page, so it must stay self-contained and
 * must not reference any variable from this file.
 *
 * @returns {string[]} Exhibitor detail-page URLs found on the current page.
 */
function scrapeExhibitorLinks() {
  var anchors = document.querySelectorAll('.resultItem.exhibitor h3 a');
  return Array.prototype.map.call(anchors, function (anchor) {
    return 'https://london.wtm.com' + anchor.getAttribute('href');
  });
}

/**
 * Page-context helper for nightmare.evaluate(): reads the company name,
 * website, Facebook page and phone number from the exhibitor page that is
 * currently loaded. A field whose element is missing is reported as "N/A".
 *
 * It is executed inside the browser page, so it must stay self-contained and
 * must not reference any variable from this file.
 *
 * @returns {{company: string, website: string, facebook: string, phone: string}}
 */
function scrapeExhibitorDetails() {
  function read(selector, property) {
    var element = document.querySelector(selector);
    return element ? element[property] : 'N/A';
  }

  return {
    company: read('h2.exhibitorName', 'innerText'),
    website: read('.socialNetworkProfiles .link a', 'href'),
    facebook: read('.socialNetworkProfiles .facebook a', 'href'),
    phone: read('.vcard .tel .value', 'innerText'),
  };
}

/**
 * Scrapes the paginated exhibitor directory and saves the result as CSV.
 *
 * Steps:
 *   1. Open the directory and walk the pagination (at most MAX_PAGE clicks),
 *      collecting the exhibitor links of every page.
 *   2. Visit each exhibitor page once and read its company name, website,
 *      Facebook page and phone number.
 *   3. Hand the rows to outputToCsv().
 *
 * Written as a generator so a runner such as vo can resolve each yielded
 * Nightmare chain before resuming.
 *
 * NOTE(review): the symbols at the start of the log messages look like
 * mis-encoded emoji. They are kept as found because the intended characters
 * cannot all be recovered reliably; restore them from the original source.
 */
function* run() {
  const MAX_PAGE = 300;
  const NEXT_BUTTON = '.pagination .gButton a';
  const nightmare = Nightmare();

  try {
    // Let's get all links from that address below
    yield nightmare
      .goto('http://london.wtm.com/en/exhibitor-directory-2017/?rpp=12&d=103087|152_214625')
      .wait(3000);

    const linksPerPage = [];
    let currentPage = 0;

    // Keep paging for as long as a `next` button is visible.
    let nextExists = yield nightmare.visible(NEXT_BUTTON);
    while (nextExists && currentPage < MAX_PAGE) {
      const pageLinks = yield nightmare.evaluate(scrapeExhibitorLinks);
      linksPerPage.push(pageLinks);

      yield nightmare
        .click(NEXT_BUTTON)
        .wait(5000);

      currentPage++;
      nextExists = yield nightmare.visible(NEXT_BUTTON);
    }

    // The page we ended on has not been scraped yet: it is either the last
    // page of the pagination or the only page when there is no `next` button.
    const lastPageLinks = yield nightmare.evaluate(scrapeExhibitorLinks);
    linksPerPage.push(lastPageLinks);

    // We want a clean array of links
    const links = flatten(linksPerPage);

    if (links.length === 0) {
      console.log('β οΈ Sorry. No links found. Exiting.');
      return;
    }

    console.log('βΉοΈ ' + links.length + ' links found. \r');
    console.log('π₯ Let\'s do this! Please be aware that this could take a while... Be patient. \n');

    // All four fields are read during a single visit per exhibitor page (the
    // page used to be loaded once per field), so the progress messages are
    // printed together, in their original order, before that single pass.
    console.log('π Retrieving companies names...\r');
    console.log('π Retrieving companies websites...\r');
    console.log('π Retrieving companies Facebook Page...\r');
    console.log('βοΈ Retrieving companies phone numbers...\r');

    // One row per exhibitor: [url, company, website, facebook, phone].
    const data = [];
    for (const link of links) {
      yield nightmare.goto(link);
      const details = yield nightmare.evaluate(scrapeExhibitorDetails);
      data.push([
        link,
        details.company,
        details.website,
        details.facebook,
        details.phone,
      ]);
    }

    console.log('π Saving results in .csv file...\n');
    outputToCsv(data);
    console.log('π€ Go make some money now!');
  } finally {
    // Always shut the browser down, including when a step above throws, so
    // the process is not left hanging on an open Nightmare instance.
    yield nightmare.end();
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment