Skip to content

Instantly share code, notes, and snippets.

@throughnothing
Last active May 24, 2016 16:57
Show Gist options
  • Save throughnothing/cbc361f5302eb80deb8de7ea85288c91 to your computer and use it in GitHub Desktop.
Save throughnothing/cbc361f5302eb80deb8de7ea85288c91 to your computer and use it in GitHub Desktop.
Yelp Scraper
#!/usr/bin/env node
var Yelp = require('yelp');
var request = require('superagent');
var superagentPromisePlugin = require('superagent-promise-plugin');
var Immutable = require('immutable');
var range = require('node-range');
// Step between consecutive search-result offsets; one page is requested per
// multiple of this value.
var yelpBizPerPage = 10;

// Matches a quoted business link of the form "/biz/<id> terminated by either
// `?` or a closing `"`. Global so String#match returns every hit on the page.
var yelpBizRegex = /"\/biz\/([\w-]+)[\?"]/gi;

// Yelp API client. The four credential fields are blank in this file;
// presumably they must be filled in before the business lookups succeed.
var yelp = new Yelp({
  consumer_key: '',
  consumer_secret: '',
  token: '',
  token_secret: ''
});
// Show help if needed.
// The page count (argv[3]) is mandatory: the script refuses to run without
// it, so the usage text lists it as a required positional argument. Only the
// search phrase (argv[4]) is optional.
if (!process.argv[3]) {
  console.error('Usage: ./scrape-yelp.js "Coppel, TX" num_pages [search_phrase] > File.csv');
  process.exit(1);
}
// Scrape pipeline: search pages -> business ids -> API lookups -> CSV on stdout.
Promise.all(getOffsetsFromArgv()
  // Get HTML Pages from Yelp
  .map(function(offset){
    return getPage(getSearchFromArgv(), getLocationFromArgv(), offset)
      .catch(function(err){
        // Best-effort: report the failed page (with the reason) and let it
        // resolve to undefined so the remaining pages are still processed.
        console.error(
          "Failed scraping Yelp page: ",
          offset / yelpBizPerPage,
          (err && err.message) || err
        );
      });
  }))
  // Parse Business IDs from HTML Page.
  // Pages that failed above resolved to undefined; drop them first, otherwise
  // reading their response text throws and aborts the whole run.
  .then(function(responses){ return getBizIdsFromPage(responses.filter(notNull)); })
  // Flatten + De-Dupe all yelp businessIds parsed from HTML requests
  .then(function(bizIds){ return Immutable.fromJS(bizIds).flatten().toSet(); })
  // Retrieve all businesses by id from the yelp API
  .then(function(bizIdsSet){ return Promise.all(bizIdsSet.map(getBizData)); })
  // Print retrieved business data as CSV (failed lookups resolve to
  // undefined and are skipped)
  .then(function(bizDatas){ console.log(bizDatas.filter(notNull).map(getBizCsv).join('\n')); })
  .catch(console.error);
// Predicate: true for every value except null and undefined.
function notNull(v) {
  return !(v === null || v === undefined);
}
// Builds the list of result offsets to request, one per page.
// The page count is read from the second CLI argument (process.argv[3]);
// page index i maps to offset i * yelpBizPerPage.
function getOffsetsFromArgv() {
  var pageCount = process.argv[3];

  function toOffset(pageIndex) {
    return pageIndex * yelpBizPerPage;
  }

  return range(0, pageCount).map(toOffset);
}
// Returns the first CLI argument (process.argv[2], the location),
// percent-encoded for use inside a URL query string.
function getLocationFromArgv() {
  var rawLocation = process.argv[2];
  return encodeURIComponent(rawLocation);
}
// Returns the optional third CLI argument (process.argv[4], the search
// phrase), percent-encoded. A missing or empty phrase yields ''.
function getSearchFromArgv() {
  var phrase = process.argv[4];
  if (!phrase) {
    phrase = '';
  }
  return encodeURIComponent(phrase);
}
// Issues a GET for one Yelp search-results page.
//   search   - search phrase, inserted verbatim (callers must URL-encode it)
//   location - location text, inserted verbatim (callers must URL-encode it)
//   offset   - value for the `start` query parameter
// Returns the superagent request with the promise plugin applied.
function getPage(search, location, offset) {
  var searchUrl = 'https://www.yelp.com/search?' +
    'find_desc=' + search +
    '&find_loc=' + location +
    '&start=' + offset;

  var pageRequest = request.get(searchUrl);
  return pageRequest.use(superagentPromisePlugin);
}
// Extracts Yelp business ids from each search-page response.
//   responses - array of objects whose `text` property holds the page HTML
// Returns an array with one Immutable collection of id strings per response;
// a page with no matches yields an empty collection.
function getBizIdsFromPage(responses) {
  // Strip the leading `"/biz/` and the trailing `?` or `"` left by the regex.
  function toBizId(rawMatch) {
    return rawMatch.replace('"/biz/', "").replace(/[\?"]$/, "");
  }

  function idsFromResponse(res) {
    // String#match returns null when nothing matches; wrapping it in an
    // array and filtering afterwards turns that into an empty result.
    var matches = res.text.match(yelpBizRegex);
    return Immutable.fromJS([matches])
      .flatten()
      .filter(notNull)
      .map(toBizId);
  }

  return responses.map(idsFromResponse);
}
// Looks up one business through the Yelp API client.
//   bizId - Yelp business id
// Returns a promise for the business data. Lookup failures are best-effort:
// the failure (including its cause, so it can be diagnosed) is logged to
// stderr and the promise resolves to undefined instead of rejecting.
function getBizData(bizId) {
  return yelp.business(bizId)
    .catch(function(err) {
      console.error('Error getting biz:', bizId, err);
    });
}
// Renders one business record as a single CSV line.
// Column order: name, rating, review count, phone, street address, city,
// state, postal code, id, categories, Yelp URL (query string removed),
// Facebook search link, Instagram search link. Every column goes through
// quote() before the columns are joined with commas.
function getBizCsv(bizData){
  var details = [
    bizData.name,
    bizData.rating,
    bizData.review_count,
    bizData.display_phone
  ];

  var place = [
    normalizeAddress(bizData.location.display_address || []),
    bizData.location.city,
    bizData.location.state_code,
    bizData.location.postal_code
  ];

  var identity = [
    bizData.id,
    normalizeCategories(bizData.categories || [])
  ];

  // Links
  var links = [
    bizData.url.replace(/\?.*$/, ''),
    facebookSearchLink(bizData.name),
    instagramSearchLink(bizData.name)
  ];

  var columns = details.concat(place, identity, links);
  return columns.map(quote).join(',');
}
// Joins every address line except the last one with single spaces.
// (Presumably the last line is the city/state/zip line, which the CSV
// already carries in separate columns - confirm against the API output.)
function normalizeAddress(address) {
  var leadingLines = address.slice(0, -1);
  return leadingLines.join(' ');
}
// Collapses a list of category sublists into one comma-separated string,
// keeping only the first element of each sublist (presumably the display
// name - confirm against the API output).
function normalizeCategories(categories){
  function firstEntry(sublist) {
    return sublist[0];
  }

  var labels = categories.map(firstEntry);
  return labels.join(',');
}
// Wraps a value in double quotes for use as a CSV field.
// Embedded double quotes are doubled so a value such as `Joe's "Best" Pizza`
// cannot terminate the field early, and null/undefined become an empty field
// instead of the literal text "null"/"undefined".
function quote(str) {
  if (str == null) {
    return '""';
  }
  return '"' + String(str).replace(/"/g, '""') + '"';
}
// Builds a Facebook page-search URL whose query is the URL-encoded
// business name.
function facebookSearchLink(bizName){
  var query = encodeURIComponent(bizName);
  return "https://www.facebook.com/search/pages/?q=" + query + "&ref=top_filter";
}
// Builds a Google search URL for the URL-encoded business name, restricted
// to instagram.com and excluding URLs that contain /explore/.
function instagramSearchLink(bizName){
  var query = encodeURIComponent(bizName);
  return "https://www.google.com/search?q=" +
    query +
    "+site%3Ainstagram.com+-inurl%3A%2Fexplore%2F";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment