Skip to content

Instantly share code, notes, and snippets.

@blmarket
Created July 27, 2012 01:31
Show Gist options
  • Save blmarket/3185663 to your computer and use it in GitHub Desktop.
Save blmarket/3185663 to your computer and use it in GitHub Desktop.
Naver baseball scraper
// Command-line helper: downloads the YouTube video whose id is the last
// command-line argument, then exits.
var you = require('./youtube');
console.log(process.argv);
var id = process.argv[process.argv.length-1];
you.download(id, function(err, res) {
  if (err) {
    // Surface the failure through the exit status instead of always
    // reporting success.
    console.error(err);
    process.exit(1);
  }
  process.exit(0);
});
// Manual smoke test: posts one sample clip to the /add endpoint of a server
// listening on localhost:3279 and prints the response body.
var request = require('request');
var url = 'http://localhost:3279/add';
request({
  method: 'POST',
  url: url,
  form: {
    category: 'kbo',
    id: '28178',
    title: '7월 27일 롯데vs두산',
    description: '하이라이트 : "이종욱 끝내기" 두산 역전승 2위 탈환'
  }
}, function(err, res, body) {
  if (err) {
    // Without this, a refused connection would just print "undefined".
    console.error(err);
    return;
  }
  console.log(body);
});
var fetch = require('./lib/cache_fetcher').get;
var rss = require('./lib/rss');
var models = require('./lib/models');
var express = require('express');
var path = require('path');
var fs = require('fs');
// Create the Express application and register its middleware. The order of
// the app.use() calls is the order in which the middleware is registered.
var app = express.createServer();
app.use(express.methodOverride());
// The POST handlers below read their input from req.body; presumably
// bodyParser is what populates it — confirm against the Express version in use.
app.use(express.bodyParser());
// Routes are registered ahead of the static file handler.
app.use(app.router);
// Static files are served from the "storage" directory next to this script.
app.use(express.static(__dirname + "/storage"));
// GET /rss — responds with the feed XML produced by the rss module.
app.get('/rss', function(req, res, next) {
  rss(function(err, result) {
    // Forward failures to the error handler rather than answering with an
    // empty body labelled as RSS.
    if (err) return next(err);
    console.log(result);
    res.header('Content-Type', 'application/rss+xml');
    res.end(result);
  });
});
// GET /add/youtube — renders the 'add_youtube' view.
app.get('/add/youtube', function show_add_youtube(req, res, next) {
  res.render('add_youtube');
});
// POST /add/youtube — fetches (or finds in the cache) the YouTube video named
// by the `id` form field and answers 'done' once it is available.
app.post('/add/youtube', function(req, res, next) {
  if(!req.body) return next(404);
  var id = req.body.id;
  if(!id) return next(404);
  var key = "youtube:" + id;
  fetch(key, function(err, filename) {
    // Only report success when the fetch actually succeeded.
    if(err) return next(err);
    res.send('done');
  });
});
// POST /add — fetches the Naver clip identified by the `category` and `id`
// form fields, then records it in the RSS list. `title` and `description`
// are optional and fall back to placeholder text.
app.post('/add', function(req, res, next) {
  var body = req.body;
  if (!body) return next(404);
  if (!body.id || !body.category) return next(404);

  var key = "naver:" + body.category + ":" + body.id;
  // The entry is timestamped when the request arrives, not when the
  // download completes.
  var entry = {
    key: key,
    date: new Date(),
    title: body.title || "No Title",
    desc: body.description || "No Description"
  };

  fetch(key, function on_fetched(fetch_err, filename) {
    if (fetch_err) {
      next(fetch_err);
      return;
    }
    models.addtorss(entry, function on_added(add_err, value) {
      if (add_err) {
        next(add_err);
        return;
      }
      res.end('done');
    });
  });
});
// GET /get/:key — resolves the cache key to a local file and sends it as a
// download named after the file's basename.
app.get('/get/:key', function(req, res, next) {
  var key = req.param('key', null);
  if (!key) {
    return next(new Error('Invalid arguments'));
  }
  fetch(key, function on_resolved(err, filename) {
    if (err) {
      return next(err);
    }
    console.log("sending " + filename);
    // Alternatives tried earlier: res.sendfile(filename), or redirecting to
    // the basename so the static handler serves it.
    var download_name = path.basename(filename);
    res.download(filename, download_name);
  });
});
// Start accepting connections on port 3279.
app.listen(3279);
console.log("Listening 3279");
// Disabled leftovers, kept for reference:
// client.lpush('rss', 'kbo:28179');
//process.exit(0); // explicit exit due to redis client
var step = require('step');
var jsdom = require('jsdom');
var models = require('./models');
var fetch_naver = require('./naver_fetch').fetch_movieclip;
var fetch_youtube = require('./youtube').download;
var client = require('./redis').client;
var extract = require('./extract').extract;
/**
 * Dispatches a download to the fetcher that matches `options.type`.
 *
 * @param options {Object} video information; `type` selects the fetcher:
 *   'naver' uses `category` and `id`, 'youtube' uses `id`
 * @param callback {Function} result callback, accepts (err, filename)
 */
function get_by_object(options, callback) {
  // Always answer the caller, even when there is nothing to dispatch on.
  if (!options) {
    return callback(new Error('No video information given'));
  }
  switch (options.type) {
    case 'naver':
      fetch_naver(options.category, options.id, callback);
      break;
    case 'youtube':
      fetch_youtube(options.id, callback);
      break;
    default:
      // Without this branch an unknown type would leave the caller waiting forever.
      callback(new Error('Unsupported video type: ' + options.type));
  }
}
/**
 * Collects Open Graph metadata from the <meta> tags in a document's head.
 *
 * Only 'og:title' and 'og:description' are picked up; they are returned
 * under the keys `title` and `desc`. The result is also logged.
 *
 * @param window {Object} DOM window; a falsy value yields an empty result
 * @returns {Object} the collected fields
 */
function parse_ogp(window) {
  var fields = { 'og:title': 'title', 'og:description': 'desc' };
  var result = {};
  if (!window) return result;

  var metas = window.document.head.getElementsByTagName('meta');
  for (var idx = 0, count = metas.length; idx < count; idx++) {
    var node = metas[idx];
    if (!node) continue;

    var property = node.attributes['property'];
    if (!property) continue;
    var content = node.attributes['content'];
    if (!content) continue;

    var target = fields[property.nodeValue];
    if (target) {
      result[target] = content.nodeValue;
    }
  }
  console.log(result);
  return result;
}
/**
 * Loads `url` with jsdom, logs the Open Graph data found on the page and
 * reports whether a file was saved.
 *
 * NOTE(review): only one parallel callback is registered because the
 * download call is commented out, so `filename` is presumably never
 * provided and the "save failed" branch is always taken — confirm against
 * step's parallel() semantics.
 *
 * @param url {String} page address handed to jsdom
 */
function getiton(url) {
  step(
    function load_page() {
      var on_loaded = this.parallel();
      jsdom.env({ html: url, done: on_loaded });
      //get_by_object(extract(url), this.parallel());
    },
    function report(err, window, filename) {
      // Called for its logging side effect; the parsed data is not used yet.
      parse_ogp(window);
      if (filename) {
        console.log("Saved to : " + filename);
        return;
      }
      console.log("save failed : " + err);
    }
  );
}
module.exports.get_by_url = getiton;
module.exports.get_by_object = get_by_object;
/**
 * Resolves a cache key to a local filename, downloading the video when the
 * key is not stored yet.
 *
 * Keys have the form 'naver:<category>:<id>' or 'youtube:<id>'. A freshly
 * downloaded naver clip is recorded through models.set_video.
 *
 * @param key {String} cache key
 * @param callback {Function} result callback, accepts (err, filename)
 */
module.exports.get = function(key, callback) {
  models.get_video(key, function(err, cached) {
    // Report failures to the caller; throwing from inside an asynchronous
    // callback would bypass it entirely.
    if (err) return callback(err);
    if (cached) return callback(null, cached);

    var parts = String(key).split(':');
    switch (parts[0]) {
      case 'naver':
        // The naver fetcher is in scope under the name fetch_naver.
        fetch_naver(parts[1], parts[2], function(err, filename) {
          if (err) return callback(err);
          console.log("File written to " + filename);
          models.set_video(key, filename);
          callback(null, filename);
        });
        break;
      case 'youtube':
        fetch_youtube(parts[1], callback);
        break;
      default:
        // An unrecognised prefix must still produce an answer.
        callback(new Error('Unsupported key: ' + key));
    }
  });
};
var fetch = require('../cache_fetcher');

// Prints the expected command-line form.
function usage() {
  console.log("Usage: [url]");
}

// The last command-line entry is taken as the URL to fetch.
var url = process.argv.pop();
console.log(url);

// A value starting with '/' is treated as "not a url" — presumably this
// catches the case where no argument was given and the last entry is the
// script's own path.
if (url[0] !== '/') {
  fetch.get_by_url(url);
} else {
  usage();
}
#!/usr/bin/env perl
# Downloads a YouTube video to /tmp/<id>.mp4 and prints the resulting path
# (without a trailing newline) on standard output.
#
# Usage: download.pl <video-id>
use strict;
use warnings;
use WWW::YouTube::Download;
use Data::Dumper;
use List::Util;

# Format codes to try, most preferred first.
my @quality_preference = ( 84, 85, 22, 82, 83, 18, 37, 38 );

my $id = pop @ARGV;
die "Usage: $0 <video-id>\n" unless defined $id && length $id;

# The id becomes part of a filesystem path below, so accept only a plain
# token and reject anything that could escape /tmp.
die "Invalid video id: $id\n" unless $id =~ /\A[A-Za-z0-9_-]+\z/;

# Keep the client in a named lexical instead of the global $_, which any
# intervening code is free to overwrite.
my $client = WWW::YouTube::Download->new;
my @fmts = @{ $client->get_fmt_list($id) };

# Pick the first preferred format that the video actually offers; when none
# matches, $dst_fmt stays undefined and is passed on as such.
my $dst_fmt;
foreach my $wanted (@quality_preference) {
    my $found = List::Util::first { int($_) == int($wanted) } @fmts;
    if ($found) {
        $dst_fmt = $found;
        last;
    }
}

my $filename = "/tmp/$id.mp4";
$client->download($id, { filename => $filename, fmt => $dst_fmt });
print "$filename";
var urlparse = require('url');
var querystring = require('querystring');
/**
 * Recognises Naver sports video URLs.
 *
 * Two forms are understood:
 *   sports.news.naver.com/<category>/video.nhn?id=<id>
 *   sports.news.naver.com/videoCenter...?category=<category>&id=<id>
 *
 * @param url {String} address to inspect
 * @returns {Object|null} { type: 'naver', category, id }, or null when the
 *   URL is not a Naver video link or carries no id
 */
function Dialect_naver(url) {
  // Dots are escaped so that only the literal host name matches.
  var naver_london = /sports\.news\.naver\.com\/([^\/]*)\/video\.nhn\?(.*)$/;
  var match = url.match(naver_london);
  if (match) {
    var id = querystring.parse(match[2]).id;
    // A link without an id cannot be fetched, so it is not reported as a match.
    if (id) {
      return {
        type: 'naver',
        category: match[1],
        id: id
      };
    }
  }

  var naver_videoCenter = /sports\.news\.naver\.com\/videoCenter/;
  if (naver_videoCenter.test(url)) {
    var query = querystring.parse(urlparse.parse(url).query);
    if (query.id && query.category) {
      return {
        type: 'naver',
        category: query.category,
        id: query.id
      };
    }
  }
  return null;
}
/**
 * Recognises YouTube video URLs.
 *
 * Understands the long form (.../watch?v=<id>) and the short-link form
 * (youtu.be/<id>), where the id is the first path segment.
 *
 * @param url {String} address to inspect
 * @returns {Object|null} { type: 'youtube', id }, or null when the URL is
 *   not a YouTube link or carries no video id
 */
function Dialect_youtube(url) {
  // Dots are escaped so that only the literal host names match.
  var watch = url.match(/(?:youtube\.com|youtu\.be)\/watch\?(.*)$/);
  if (watch) {
    var id = querystring.parse(watch[1]).v;
    return id ? { type: 'youtube', id: id } : null;
  }

  // Short links carry the id in the path rather than in a query parameter.
  var short_link = url.match(/youtu\.be\/([A-Za-z0-9_-]+)/);
  if (short_link) {
    return { type: 'youtube', id: short_link[1] };
  }
  return null;
}
/**
 * Wraps a value as a YouTube video descriptor without inspecting it.
 *
 * @param url {String} value used verbatim as the video id
 * @returns {Object} { type: 'youtube', id: url }
 */
function justyoutube(url) {
  var descriptor = { type: 'youtube' };
  descriptor.id = url;
  return descriptor;
}
/**
 * Tries each known URL dialect in turn (naver first, then youtube) and
 * returns the first truthy result.
 *
 * @param url {String} address to classify
 * @returns {Object|null} the matching dialect's descriptor, or null
 */
function cocktail(url) {
  var found = null;
  [ Dialect_naver, Dialect_youtube ].some(function(dialect) {
    var parsed = dialect(url);
    if (parsed) found = parsed;
    // Returning true stops the iteration at the first match.
    return Boolean(parsed);
  });
  return found;
}
module.exports.extract = cocktail;
var client = require('./redis').client;
var redis = require('redis');
var hashid = 'scraper';
// Reads every entry of the 'rss' list. When no callback is given the reply
// is handed to redis.print.
module.exports.rsslist = function RSSList(callback) {
  var on_reply = callback || redis.print;
  return client.lrange('rss', 0, -1, on_reply);
};
// Pushes `item`, serialised as JSON, onto the head of the 'rss' list. When
// no callback is given the reply is handed to redis.print.
module.exports.addtorss = function AddToRSS(item, callback) {
  var on_reply = callback ? callback : redis.print;
  var payload = JSON.stringify(item);
  return client.lpush('rss', payload, on_reply);
};
// Deletes the 'rss' list. As with the other RSS helpers in this module, the
// reply is handed to redis.print when no callback is given.
module.exports.resetrss = function ResetRSS(callback) {
  if(!callback) callback = redis.print;
  return client.del('rss', callback);
};
// Stores `value` under `key` in the redis hash named `hash` and returns
// whatever the client's hset call returns. No reply callback is passed.
function set_hash(hash, key, value) {
  var outcome = client.hset(hash, key, value);
  return outcome;
}
// Looks up `key` in the redis hash named `hash`; the reply is delivered to
// `callback`. Nothing is returned.
function get_hash(hash, key, callback) {
  var store = client;
  store.hget(hash, key, callback);
}
// Records `value` for the video `key` in the module's hash.
module.exports.set_video = function set_video(key, value) {
  set_hash(hashid, key, value);
};
// Looks up the value stored for the video `key` in the module's hash; the
// reply is delivered to `callback`.
module.exports.get_video = function get_video(key, callback) {
  get_hash(hashid, key, callback);
};
var urlparse = require('url');
var request = require('request');
var step = require('step');
var fs = require('fs');
var mkdirp = require('mkdirp');
var temp = require('temp');
// Fixed mobile game-center page. fetch_movieclip requests it first, derives
// the vod.nhn host from it, and sends it as the Referer of later requests.
var url = 'http://m.sports.naver.com/baseball/gamecenter/kbo/index.nhn?tab=vod&gameId=20120726WOHT0&vodId=28092';
// User-Agent header sent with every request made by fetch_movieclip.
var agent = 'Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X; en-us) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B176 Safari/7534.48.3';
/**
 * Downloads a Naver sports video clip into a temporary .mp4 file.
 *
 * Three requests are made in sequence:
 *   1. GET the module-level `url` page (its body is not used),
 *   2. POST `id` and `category` to /vod.nhn on the same host; the JSON
 *      reply's `url` field is the clip address,
 *   3. GET the clip and stream it to a temp file.
 *
 * @param category {String} sent as the `category` form field
 * @param id {String} sent as the `id` form field
 * @param callback {Function} called once with (err, filename)
 */
function fetch_movieclip(category, id, callback) {
  var finished = false;

  // Delivers the outcome exactly once, whichever event reports it first.
  function done(err, filename) {
    if (finished) return;
    finished = true;
    callback(err, filename);
  }

  step(function fetch_url() {
    request({
      method: 'GET',
      url: url,
      headers: {
        'User-Agent': agent
      }
    }, this);
  }, function handle_response(err, res, body) {
    // Failures go to the caller; a throw here would never reach it.
    if (err) return done(err);
    var obj = urlparse.parse(url);
    var vodurl = urlparse.format({
      protocol: 'http',
      host: obj.host,
      pathname: '/vod.nhn'
    });
    console.log("try fetching... " + vodurl);
    request({
      method: 'POST',
      url: vodurl,
      headers: {
        'User-Agent': agent,
        'Origin': 'http://m.sports.naver.com',
        'Host': 'm.sports.naver.com',
        'Referer': url
      },
      form: {
        id: id,
        category: category
      }
    }, this);
  }, function handle_vodresponse(err, res, body) {
    if (err) return done(err);
    if (res.statusCode != 200) return done(new Error("Invalid statusCode..."));

    var clip_url;
    try {
      clip_url = JSON.parse(body).url;
    } catch (parse_err) {
      return done(parse_err);
    }
    if (!clip_url) return done(new Error("No clip url in vod response"));

    var filename = temp.path({suffix: '.mp4'});
    var ost = fs.createWriteStream(filename);
    var req = request({
      method: 'GET',
      url: clip_url,
      headers: {
        'User-Agent': agent,
        'Origin': 'http://m.sports.naver.com',
        'Host': 'm.sports.naver.com',
        'Referer': url
      }
    });
    req.on('error', function(download_err) {
      done(download_err);
      ost.destroy();
    });
    ost.on('error', done);
    // Report success only once the file has been flushed and closed, so the
    // caller never sees a partially written file.
    ost.on('close', function() {
      done(null, filename);
    });
    // pipe() ends the write stream when the download finishes.
    req.pipe(ost);
  });
}
module.exports.fetch_movieclip = fetch_movieclip;
{
"name": "naverscraper",
"version": "0.0.0",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"repository": {
"type": "git",
"url": "git@gist.github.com:3185663.git"
},
"dependencies": {
"request": "latest",
"step": "latest",
"jsdom": "~0.2.15",
"mkdirp": "latest",
"redis": "latest",
"express": "latest",
"jstoxml": "latest",
"fluent-ffmpeg": "~1.1.7",
"temp": "~0.4.0"
},
"author": "blmarket",
"license": "BSD"
}
// Shared redis connection used by the rest of the application.
var redis = require('redis');
var client = redis.createClient();

// Connection errors are re-thrown rather than ignored.
function rethrow(err) {
  throw err;
}

// Closes the connection when the process is exiting.
function disconnect() {
  client.quit();
}

client.on('error', rethrow);
process.on('exit', disconnect);

module.exports.client = client;
var model = require('./models');
var jstoxml = require('jstoxml');
module.exports = function(callback) {
model.rsslist(function(err, res) {
if(err) callback(err);
var ret = {
_name: 'rss',
_attrs: {
'xmlns:itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
version: '2.0'
},
_content: {
channel: [
{title: 'my favourites'},
{link: 'black market'},
{description: "oh yes I don't know about copyrights"},
{lastBuildDate: function() { return new Date(); }},
{pubDate: function() { return new Date(); }},
{language: 'ko_KR'},
{
_name: 'itunes:image',
_attrs: {
href: 'http://naver.com/favicon.ico'
}
}
]
}
};
var target = ret._content.channel;
(function() {
if(!res) return;
var length = res.length;
for(var i=0;i<length;i++) {
var obj = JSON.parse(res[i]);
var tmp = obj.key.split(':');
var category = tmp[0];
var id = tmp[1];
var type = obj.type || 'video/x-flv';
target[target.length] = {
item: [
{ title: obj.title },
{
_name: 'enclosure',
_attrs: {
url: 'http://192.168.0.4:3279/get/' + obj.key,
type: type
}
},
{ description: obj.desc },
{ pubDate: function() {
return new Date();
}}
]
}
}
})();
callback(null, jstoxml.toXML(ret, {header: true, indent: ' '}));
});
};
var spawn = require('child_process').spawn;
var fs = require('fs');
var path = require('path');
var mkdirp = require('mkdirp');
var models = require('./models');
/**
 * Runs the Perl downloader for a YouTube video and reports the text it
 * prints on standard output, which is the path of the downloaded file.
 *
 * @param id {String} video id, passed as the script's only argument
 * @param callback {Function} called once with (err, filename)
 */
function Download(id, callback) {
  var child = spawn('../perl/download.pl', [ id ]);
  var filename = '';
  var finished = false;

  // A failed spawn emits both 'error' and 'close'; answer only once.
  function done(err, result) {
    if (finished) return;
    finished = true;
    callback(err, result);
  }

  // Without a listener, a script that cannot be started would crash the
  // whole process with an unhandled 'error' event.
  child.on('error', done);
  child.stdin.end();
  child.stdout.on('data', function(chunk) {
    filename = filename + chunk.toString();
  });
  // 'close' fires after the process has ended and its output streams are
  // finished, so both the exit status and the complete output are known.
  child.on('close', function(code) {
    if (code !== 0) {
      return done(new Error('download.pl exited with code ' + code));
    }
    done(null, filename);
  });
}
// Download('w5hgaNxzORs');
module.exports.download = Download;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment