Last active
December 11, 2016 01:24
-
-
Save tcrosen/d5c3729ffe6c3447f5b2 to your computer and use it in GitHub Desktop.
Kimono Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var moment = require('moment-range');
var request = require('request');
var _ = require('lodash');
var url = require('url');

// Date window to collect games for. JavaScript months are zero-based, so
// (2014, 9, 8) is October 8, 2014. `end` is the moment the script runs.
var start = new Date(2014, 9, 8);
var end = new Date();
var range = moment().range(start, end);

// Gets boxscore links from scores page (the date is appended as MM/DD/YYYY)
var scoresApi = 'https://www.kimonolabs.com/api/czde4e6c?apikey=abe6b22285a4d123b8d3ed875ac78331&date=';

// Gets data from PBP report (the report file name is appended as `kimpath4`)
var pbpApi = 'https://www.kimonolabs.com/api/adflv7dk?apikey=abe6b22285a4d123b8d3ed875ac78331&kimpath4=';

// Loop through each date from `start` to `end`
// range.by('days', function(m) {
//   // m is simply the moment.js object (eg. Date)
// });
// Scrapes the scores page to locate boxscore URLs and extract game IDs for games on a given date
// http://www.nhl.com/ice/scores.htm?date=10/17/2014
//
// date - object exposing `format()` (the caller passes a moment.js object);
//        it is rendered as MM/DD/YYYY and appended to `scoresApi`.
// done - Node-style callback, called exactly once as `done(err)` on failure
//        or `done(null, gameIds)` on success, where gameIds looks like
//        [ '020001', '020002', '020003', '020004' ].
function collectBoxscoreUrls(date, done) {
  request(scoresApi + date.format('MM/DD/YYYY'), function(err, response, body) {
    if (err) {
      // Stop here: falling through would try to parse a body that never arrived.
      return done(err);
    }

    var gameIds;
    try {
      var parsed = JSON.parse(body);

      // now parse each url and pull out the game IDs to pass to the next Kimono API
      gameIds = _.map(parsed.results.collection1, function(o) {
        // http://www.nhl.com/gamecenter/en/boxscore?id=2014020004 ==> 2014020004
        var id = url.parse(o.boxscoreLink.href, true).query.id;

        // the game ID used to retrieve the play-by-play report does not include the year (first 4 characters)
        // 2014020004 ==> 020004
        return id.substr(4);
      });
    } catch (parseErr) {
      // Invalid JSON or a payload without the expected fields is reported
      // through the callback instead of being thrown from inside this handler.
      return done(parseErr);
    }

    done(null, gameIds);
  });
}
// Scrape data from a play-by-play report.
// The parameter passed to Kimono (kimpath4) is the HTML filename which uses the game ID
// http://www.nhl.com/scores/htmlreports/20142015/PL010060.HTM
//
// gameId - game ID without the leading year, e.g. '020004'.
// done   - Node-style callback, called exactly once as `done(err)` on failure
//          or `done(null, pbpLogs)` with the entries of `results.collection1`.
function collectPlayByPlayData(gameId, done) {
  var kimpath4 = 'PL' + gameId + '.HTM';

  request(pbpApi + kimpath4, function(err, response, body) {
    if (err) {
      // Stop here: falling through would try to parse a body that never arrived.
      return done(err);
    }

    var pbpLogs;
    try {
      var parsed = JSON.parse(body);
      pbpLogs = _.map(parsed.results.collection1, function(o) {
        return o;
      });
    } catch (parseErr) {
      // Invalid JSON or a payload without `results` is reported through the
      // callback instead of being thrown from inside this handler.
      return done(parseErr);
    }

    done(null, pbpLogs);
  });
}
// Get all game data for a single date
//
// Looks up the game IDs for `date`, then fetches and prints the play-by-play
// data of the first game only. Failures are logged and end the run early.
//
// date - passed straight through to collectBoxscoreUrls.
function getGames(date) {
  collectBoxscoreUrls(date, function(err, gameIds) {
    if (err) {
      console.log(err);
      return;
    }

    // Without at least one ID there is no report file name to request.
    if (!gameIds || gameIds.length === 0) {
      console.log('No games found for the requested date.');
      return;
    }

    collectPlayByPlayData(gameIds[0], function(pbpErr, pbpLogs) {
      if (pbpErr) {
        console.log(pbpErr);
        return;
      }
      console.log(pbpLogs);
    });
  });
}
// Kick off the run for the first day of the range (`start`).
getGames(moment(start));
Okay, I did a bit of work to clean it up and updated it to a working version, which at the moment collects the games for a single date and then gets the play-by-play data for just one of those games. So now the questions are:
- How to run for each game in a day?
- How to run for multiple days?
Okay I think I've figured it out. I didn't realize I could update my APIs via API (confusing!).
Here's an updated version that sets the list of Play by Play report URLs to crawl from games for a day. The only question now is what is the best way to do this for many days...
var moment = require('moment-range');
var request = require('request');
var _ = require('lodash');
var url = require('url');
var fs = require('fs');
// Date window to collect games for. JavaScript months are zero-based, so
// (2014, 9, 8) is October 8, 2014. `end` is the moment the script runs.
var start = new Date(2014, 9, 8);
var end = new Date();
var range = moment().range(start, end);
// Kimono API key used by every request in this script.
// NOTE(review): the key is committed in plain text; consider loading it from the environment instead.
var apiKey = 'abe6b22285a4d123b8d3ed875ac78331';
// Gets boxscore links from scores page (the date is appended as MM/DD/YYYY)
var scoresApi = 'https://www.kimonolabs.com/api/czde4e6c?apikey=' + apiKey + '&date=';
// Gets data from PBP report
var pbpApi = 'https://www.kimonolabs.com/api/adflv7dk?apikey=' + apiKey;
// Loop through each date from `start` to `end` (not enabled yet)
// range.by('days', function(m) {
//   // m is simply the moment.js object (eg. Date)
// });
// Scrapes the scores page to locate boxscore URLs and extract game IDs for games on a given date
// http://www.nhl.com/ice/scores.htm?date=10/17/2014
//
// date - object exposing `format()` (the caller passes a moment.js object);
//        it is rendered as MM/DD/YYYY and appended to `scoresApi`.
// done - Node-style callback, called exactly once as `done(err)` on failure
//        or `done(null, gameIds)` on success, where gameIds looks like
//        [ '020001', '020002', '020003', '020004' ].
function collectBoxscoreUrls(date, done) {
  request(scoresApi + date.format('MM/DD/YYYY'), function(err, response, body) {
    if (err) {
      // Stop here: falling through would try to parse a body that never arrived.
      return done(err);
    }

    var gameIds;
    try {
      var parsed = JSON.parse(body);

      // now parse each url and pull out the game IDs to pass to the next Kimono API
      gameIds = _.map(parsed.results.collection1, function(o) {
        // http://www.nhl.com/gamecenter/en/boxscore?id=2014020004 ==> 2014020004
        var id = url.parse(o.boxscoreLink.href, true).query.id;

        // the game ID used to retrieve the play-by-play report does not include the year (first 4 characters)
        // 2014020004 ==> 020004
        return id.substr(4);
      });
    } catch (parseErr) {
      // Invalid JSON or a payload without the expected fields is reported
      // through the callback instead of being thrown from inside this handler.
      return done(parseErr);
    }

    done(null, gameIds);
  });
}
// Point the PBP Kimono API at the play-by-play report of every given game.
//
// gameIds - game IDs without the leading year, e.g. [ '020001', '020002' ].
// done    - handed to `request` unchanged, so it receives whatever `request`
//           passes to its callback (the caller reads err, response, body).
function setPbpApiSourceUrls(gameIds, done) {
  // Turns one game ID into the address of its play-by-play report
  function toReportUrl(gameId) {
    return 'http://www.nhl.com/scores/htmlreports/20142015/PL' + gameId + '.HTM';
  }

  var reportUrls = _.map(gameIds, toReportUrl);

  var payload = {
    apikey: apiKey,
    urls: reportUrls
  };

  var options = {
    url: 'https://www.kimonolabs.com/kimonoapis/adflv7dk/update',
    method: 'POST',
    json: true,
    body: payload
  };

  request(options, done);
}
// Ask Kimono to start crawling the PBP reports currently configured on the API.
//
// done - handed to `request` unchanged, so it receives whatever `request`
//        passes to its callback (the caller reads err, response, body).
function startPbpCrawl(done) {
  var credentials = { apikey: apiKey };

  var options = {
    url: 'https://www.kimonolabs.com/kimonoapis/adflv7dk/startcrawl',
    method: 'POST',
    json: true,
    body: credentials
  };

  request(options, done);
}
// Scrape data from play-by-play reports.
//
// Fetches the current PBP API results, saves the raw response to `pbp.json`
// and hands the entries of `results.collection1` to the callback.
//
// done - Node-style callback, called exactly once as `done(err)` on failure
//        or `done(null, pbpLogs)` on success.
function collectPlayByPlayData(done) {
  request(pbpApi, function(err, response, body) {
    if (err) {
      // Stop here: falling through would try to parse a body that never arrived.
      return done(err);
    }

    var pbpLogs;
    try {
      var parsed = JSON.parse(body);
      pbpLogs = _.map(parsed.results.collection1, function(o) {
        return o;
      });
    } catch (parseErr) {
      // Invalid JSON or a payload without `results` is reported through the
      // callback instead of being thrown from inside this handler.
      return done(parseErr);
    }

    // Keep a copy of the raw response on disk. fs.writeFile needs a completion
    // callback; a failed write is logged but does not fail the scrape.
    fs.writeFile('pbp.json', body, function(writeErr) {
      if (writeErr) {
        console.log(writeErr);
      }
    });

    done(null, pbpLogs);
  });
}
// Get all game data for a single date
//
// Looks up the game IDs for `date`, points the PBP API at the matching
// play-by-play reports and then starts a crawl. Each step runs only when the
// previous one succeeded; failures are logged and end the run early.
//
// date - passed straight through to collectBoxscoreUrls.
function getGames(date) {
  collectBoxscoreUrls(date, function(err, gameIds) {
    if (err) {
      console.log(err);
      return;
    }

    setPbpApiSourceUrls(gameIds, function(updateErr, resp, body) {
      if (updateErr) {
        // Do not crawl when the URL list could not be updated.
        console.log(updateErr);
        return;
      }

      // A reply without the updated instructions is treated as a failed
      // update: print what came back and stop before crawling.
      if (!body || !body.api || !body.api.instructions) {
        console.log(body);
        return;
      }

      console.log('API updated to crawl these URLS: ', body.api.instructions.urls);

      startPbpCrawl(function(crawlErr, crawlResp, crawlBody) {
        if (crawlErr) {
          console.log(crawlErr);
        } else {
          console.log(crawlBody);
        }

        // collectPlayByPlayData(function(err, pbpLogs) {
        //   //console.log(pbpLogs);
        // });
      });
    });
  });
}
// Kick off the run for the first day of the range (`start`).
getGames(moment(start));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For this last function that starts with range.by('days', ..., is there a way to slow down the calls? How many calls are being made, and over what period of time?