Last active
May 24, 2016 16:57
-
-
Save throughnothing/cbc361f5302eb80deb8de7ea85288c91 to your computer and use it in GitHub Desktop.
Yelp Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
var Yelp = require('yelp'); | |
var request = require('superagent'); | |
var superagentPromisePlugin = require('superagent-promise-plugin'); | |
var Immutable = require('immutable'); | |
var range = require('node-range'); | |
// Credentials handed to the `yelp` client constructor. All four values are
// blank placeholders in this file.
var yelpCredentials = {
  consumer_key: '',
  consumer_secret: '',
  token: '',
  token_secret: ''
};
var yelp = new Yelp(yelpCredentials);

// Multiplier that turns a page index into the `start` offset of a search URL.
var yelpBizPerPage = 10;

// Finds quoted `/biz/...` paths in a search page; the capture group holds the
// business id, and the match ends at the first `?` or closing double quote.
var yelpBizRegex = /"\/biz\/([\w-]+)[\?"]/gi;
// Show help when the required arguments are missing. The page count
// (process.argv[3]) is mandatory for this check to pass, so the usage line
// lists it as a required argument; only the search phrase is optional.
if (!process.argv[3]) {
  console.error('Usage: ./scrape-yelp.js "Coppel, TX" num_pages [search_phrase] > File.csv');
  process.exit(1);
}
// Scraping pipeline: fetch every requested search page, pull business ids out
// of the HTML, look each id up through the Yelp client, and print CSV rows to
// stdout. Diagnostics go to stderr so the CSV stream stays clean.
Promise.all(getOffsetsFromArgv()
  // Get HTML pages from Yelp
  .map(function (offset) {
    return getPage(getSearchFromArgv(), getLocationFromArgv(), offset)
      .catch(function (err) {
        // A failed page is reported and resolves to undefined; it is dropped
        // in the next step so the remaining pages are still processed.
        console.error(
          'Failed scraping Yelp page: ',
          offset / yelpBizPerPage,
          err && err.message ? err.message : err
        );
      });
  }))
  // Parse business ids from the pages that were actually fetched
  .then(function (responses) {
    return getBizIdsFromPage(responses.filter(notNull));
  })
  // Flatten + de-dupe all Yelp business ids parsed from the HTML responses
  .then(function (bizIds) {
    return Immutable.fromJS(bizIds).flatten().toSet();
  })
  // Retrieve all businesses by id from the Yelp API
  .then(function (bizIdsSet) {
    return Promise.all(bizIdsSet.map(getBizData));
  })
  // Print retrieved business data as CSV (failed lookups resolve to undefined)
  .then(function (bizDatas) {
    console.log(bizDatas.filter(notNull).map(getBizCsv).join('\n'));
  })
  .catch(function (err) {
    console.error(err);
    // Signal failure to the calling shell instead of exiting with status 0.
    process.exitCode = 1;
  });
/**
 * Predicate for filtering out missing values.
 *
 * @param {*} v - Any value.
 * @returns {boolean} false for null and undefined, true for everything else
 *   (including 0, '' and false).
 */
function notNull(v) {
  if (v === null || v === undefined) {
    return false;
  }
  return true;
}
/**
 * Builds the list of search-result offsets to request, one per page.
 *
 * The page count is read from process.argv[3]. Command-line arguments are
 * strings, so the value is converted to a number and validated before it is
 * handed to `range`.
 *
 * @returns {number[]} Offsets 0, yelpBizPerPage, 2 * yelpBizPerPage, ...
 * @throws {Error} When the page count is not a non-negative integer.
 */
function getOffsetsFromArgv() {
  var pageCount = Number(process.argv[3]);
  if (!isFinite(pageCount) || pageCount < 0 || Math.floor(pageCount) !== pageCount) {
    throw new Error('num_pages must be a non-negative integer, got: ' + process.argv[3]);
  }
  return range(0, pageCount).map(function (pageIndex) {
    return pageIndex * yelpBizPerPage;
  });
}
/**
 * Reads the location argument (process.argv[2]) and percent-encodes it so it
 * can be placed in a URL query string.
 *
 * @returns {string} The URI-component-encoded location.
 */
function getLocationFromArgv() {
  var rawLocation = process.argv[2];
  return encodeURIComponent(rawLocation);
}
/**
 * Reads the optional search phrase (process.argv[4]) and percent-encodes it
 * for use in a URL query string.
 *
 * @returns {string} The encoded phrase, or '' when no phrase was given.
 */
function getSearchFromArgv() {
  var phrase = process.argv[4];
  if (!phrase) {
    phrase = '';
  }
  return encodeURIComponent(phrase);
}
/**
 * Starts a GET request for one Yelp search-results page.
 *
 * @param {string} search - Search phrase, inserted into the URL as given
 *   (callers in this file pass an already-encoded value).
 * @param {string} location - Location, inserted into the URL as given.
 * @param {number} offset - Value for the `start` query parameter.
 * @returns {*} The superagent request with superagentPromisePlugin applied;
 *   the caller in this file chains `.catch` on it.
 */
function getPage(search, location, offset) {
  var searchUrl = [
    'https://www.yelp.com/search?',
    'find_desc=', search,
    '&find_loc=', location,
    '&start=', offset
  ].join('');
  var pending = request.get(searchUrl);
  return pending.use(superagentPromisePlugin);
}
/**
 * Extracts Yelp business ids from fetched search pages.
 *
 * Entries that are null/undefined (a page whose request failed) are skipped,
 * and a response without a `text` body is treated as an empty page, so one bad
 * page cannot abort the whole run.
 *
 * @param {Array} responses - Responses exposing the page HTML as `.text`.
 * @returns {Array} One Immutable collection of business-id strings per usable
 *   response.
 */
function getBizIdsFromPage(responses) {
  return responses
    .filter(notNull)
    .map(function (res) {
      // With the global flag, match() yields every full match, or null when
      // there is none; the null is removed by the filter below.
      var matches = (res.text || '').match(yelpBizRegex);
      return Immutable.fromJS([matches]).flatten()
        .filter(notNull)
        .map(function (quotedPath) {
          // Strip the leading `"/biz/` (case-insensitively, like the search
          // regex) and the trailing `?` or `"` terminator.
          return quotedPath.replace(/^"\/biz\//i, '').replace(/[\?"]$/, '');
        });
    });
}
/**
 * Looks up one business through the Yelp client.
 *
 * A failed lookup is reported on stderr together with its reason and resolves
 * to undefined, so the caller can filter it out and keep the other results.
 *
 * @param {string} bizId - Yelp business id.
 * @returns {Promise} Resolves with whatever `yelp.business` resolves with, or
 *   undefined when the lookup failed.
 */
function getBizData(bizId) {
  return yelp.business(bizId)
    .catch(function (err) {
      console.error(
        'Error getting biz:',
        bizId,
        err && err.message ? err.message : err
      );
    });
}
/**
 * Formats one business record as a single CSV row.
 *
 * Columns, in order: name, rating, review count, phone, street address, city,
 * state code, postal code, id, categories, Yelp URL (query string removed),
 * Facebook search link, Instagram search link. Every column is passed through
 * `quote` and the columns are joined with commas.
 *
 * @param {Object} bizData - Business record as returned by the Yelp client.
 * @returns {string} The CSV row, without a trailing newline.
 */
function getBizCsv(bizData) {
  // Guard the nested/optional fields the same way display_address and
  // categories are guarded, so one sparse record does not throw and take the
  // whole export down with it.
  var location = bizData.location || {};
  var url = bizData.url || '';
  return [
    bizData.name,
    bizData.rating,
    bizData.review_count,
    bizData.display_phone,
    normalizeAddress(location.display_address || []),
    location.city,
    location.state_code,
    location.postal_code,
    bizData.id,
    normalizeCategories(bizData.categories || []),
    // Links
    url.replace(/\?.*$/, ''),
    facebookSearchLink(bizData.name),
    instagramSearchLink(bizData.name)
  ].map(quote).join(',');
}
/**
 * Joins every address line except the last one into a single space-separated
 * string.
 *
 * NOTE(review): the dropped last line is presumably the city/state/zip line,
 * which the CSV row emits as separate columns - confirm against the API data.
 *
 * @param {string[]} address - Address lines.
 * @returns {string} All lines but the last, joined with spaces ('' when there
 *   are fewer than two lines).
 */
function normalizeAddress(address) {
  var leadingLines = address.slice(0, -1);
  return leadingLines.join(' ');
}
/**
 * Collapses a list of category entries into one comma-separated string made
 * of the first element of each entry.
 *
 * @param {Array[]} categories - Entries whose first element is used.
 * @returns {string} The first elements joined with ',' ('' for an empty list).
 */
function normalizeCategories(categories) {
  var firstOf = function (entry) {
    return entry[0];
  };
  var labels = categories.map(firstOf);
  return labels.join(',');
}
/**
 * Wraps a value in double quotes for use as a CSV field.
 *
 * Embedded double quotes are doubled so a value such as `Joe "The Man" Pizza`
 * stays a single field, and null/undefined become an empty field instead of
 * the literal text "null"/"undefined".
 *
 * @param {*} str - Value to quote; non-strings are converted with String().
 * @returns {string} The quoted CSV field.
 */
function quote(str) {
  if (str == null) {
    return '""';
  }
  return '"' + String(str).replace(/"/g, '""') + '"';
}
/**
 * Builds a Facebook page-search URL for a business name.
 *
 * @param {string} bizName - Business name; percent-encoded before insertion.
 * @returns {string} The search URL.
 */
function facebookSearchLink(bizName) {
  var query = encodeURIComponent(bizName);
  return 'https://www.facebook.com/search/pages/?q=' + query + '&ref=top_filter';
}
/**
 * Builds a Google search URL that looks for a business name on instagram.com,
 * excluding URLs that contain /explore/.
 *
 * @param {string} bizName - Business name; percent-encoded before insertion.
 * @returns {string} The search URL.
 */
function instagramSearchLink(bizName) {
  var query = encodeURIComponent(bizName);
  var siteFilter = '+site%3Ainstagram.com+-inurl%3A%2Fexplore%2F';
  return 'https://www.google.com/search?q=' + query + siteFilter;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment