Created
July 27, 2012 01:31
-
-
Save blmarket/3185663 to your computer and use it in GitHub Desktop.
Naver baseball scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Command-line entry point: downloads the YouTube video whose id is the last
// command-line argument and exits when the download callback fires.
var you = require('./youtube');

console.log(process.argv);

var id = process.argv[process.argv.length-1];

you.download(id, function(err, res) {
  if(err) {
    // Report the failure and signal it through the exit status instead of
    // pretending the download succeeded.
    console.error(err);
    process.exit(1);
    return;
  }
  process.exit(0);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Manual smoke test: posts one hard-coded clip description to the local
// server's /add endpoint and prints the response body.
var request = require('request');

var url = 'http://localhost:3279/add';

request({
  method: 'POST',
  url: url,
  form: {
    category: 'kbo',
    id: '28178',
    title: '7월 27일 롯데vs두산',
    description: '하이라이트 : "이종욱 끝내기" 두산 역전승 2위 탈환'
  }
}, function(err, res, body) {
  if(err) {
    // Without this the script would just print "undefined" when the request
    // itself fails (e.g. server not running).
    console.error(err);
    return;
  }
  console.log(body);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fetch = require('./lib/cache_fetcher').get; | |
var rss = require('./lib/rss'); | |
var models = require('./lib/models'); | |
var express = require('express'); | |
var path = require('path'); | |
var fs = require('fs'); | |
// Build the HTTP application and register its middleware in order:
// method override, body parsing, the route table, then static files served
// from the "storage" directory next to this script.
var app = express.createServer();

[
  express.methodOverride(),
  express.bodyParser(),
  app.router,
  express.static(__dirname + "/storage")
].forEach(function(middleware) {
  app.use(middleware);
});
// GET /rss: renders the feed through the rss module and returns it as
// application/rss+xml.
app.get('/rss', function(req, res, next) {
  // Guards against the feed builder invoking this callback more than once.
  var handled = false;

  rss(function(err, result) {
    if(handled) return;
    handled = true;

    // Forward builder failures to the error handler instead of sending an
    // undefined body with a 200 status.
    if(err) return next(err);

    console.log(result);
    res.header('Content-Type', 'application/rss+xml');
    res.end(result);
  });
});
// GET /add/youtube: renders the 'add_youtube' view.
function showAddYoutubeForm(req, res, next) {
  res.render('add_youtube');
}

app.get('/add/youtube', showAddYoutubeForm);
// POST /add/youtube: fetches the YouTube video named by the `id` form field
// through the cache fetcher and answers 'done' once it is available.
app.post('/add/youtube', function(req, res, next) {
  if(!req.body) return next(404);

  var id = req.body.id;
  if(!id) return next(404);

  var key = "youtube:" + id;
  fetch(key, function(err, filename) {
    // Same error handling as the POST /add route: do not report success
    // when the fetch failed.
    if(err) return next(err);
    res.send('done');
  });
});
// POST /add: fetches the Naver clip identified by the `category` and `id`
// form fields, then records it in the RSS list. `title` and `description`
// are optional and fall back to placeholder text.
app.post('/add', function(req, res, next) {
  var form = req.body;
  if(!form) return next(404);
  if(!form.id || !form.category) return next(404);

  // The entry is assembled (and timestamped) before the fetch starts.
  var entry = {
    key: "naver:" + form.category + ":" + form.id,
    date: new Date(),
    title: form.title || "No Title",
    desc: form.description || "No Description"
  };

  function onStored(err, value) {
    if(err) return next(err);
    res.end('done');
  }

  function onFetched(err, filename) {
    if(err) return next(err);
    models.addtorss(entry, onStored);
  }

  fetch(entry.key, onFetched);
});
// GET /get/:key: resolves the key to a local file through the cache fetcher
// and sends that file as a download named after its basename.
app.get('/get/:key', function(req, res, next) {
  var requestedKey = req.param('key', null);

  if(requestedKey) {
    fetch(requestedKey, function(err, filename) {
      if(err) {
        next(err);
        return;
      }
      console.log("sending " + filename);
      res.download(filename, path.basename(filename));
    });
    return;
  }

  return next(new Error('Invalid arguments'));
});
// Start accepting connections on port 3279 and announce it on stdout.
function startListening() {
  app.listen(3279);
  console.log("Listening 3279");
}

startListening();

// Leftover experiments, kept for reference:
// client.lpush('rss', 'kbo:28179');
//process.exit(0); // explicit exit due to redis client
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var step = require('step'); | |
var jsdom = require('jsdom'); | |
var models = require('./models'); | |
var fetch_naver = require('./naver_fetch').fetch_movieclip; | |
var fetch_youtube = require('./youtube').download; | |
var client = require('./redis').client; | |
var extract = require('./extract').extract; | |
/**
 * Dispatches a download to the fetcher that matches `options.type`.
 *
 * @param options {Object} video information {type:}, other elements are based on type
 *                         ('naver' uses category and id, 'youtube' uses id)
 * @param callback {Function} result callback, accepts (err, filename)
 */
function get_by_object(options, callback) {
  // A missing description (e.g. an URL no dialect recognised) must still
  // answer the caller instead of throwing or hanging.
  if(!options) {
    callback(new Error('No video description given'));
    return;
  }

  switch(options.type) {
    case 'naver': {
      fetch_naver(options.category, options.id, callback);
      break;
    }
    case 'youtube': {
      fetch_youtube(options.id, callback);
      break;
    }
    default: {
      // Previously an unknown type fell through silently and the callback
      // was never invoked.
      callback(new Error('Unsupported video type: ' + options.type));
    }
  }
};
/**
 * Collects the Open Graph title and description from a document's head.
 *
 * Every <meta> element carrying both a `property` and a `content` attribute
 * is inspected; 'og:title' is stored as `title` and 'og:description' as
 * `desc`. The result is also written to the console.
 *
 * @param window {Object} DOM window; a falsy value yields an empty object
 * @return {Object} object with optional `title` and `desc` entries
 */
function parse_ogp(window) {
  var found = {};
  if(!window) return found;

  var fieldFor = { 'og:title': 'title', 'og:description': 'desc' };
  var metas = window.document.head.getElementsByTagName('meta');

  Array.prototype.forEach.call(metas, function(meta) {
    if(!meta) return;

    var property = meta.attributes['property'];
    var content = meta.attributes['content'];
    if(!property || !content) return;

    var field = fieldFor[property.nodeValue];
    if(field) {
      found[field] = content.nodeValue;
    }
  });

  console.log(found);
  return found;
}
/**
 * Loads `url` with jsdom and passes the resulting window to parse_ogp,
 * which logs the Open Graph title/description it finds.
 *
 * NOTE(review): the call that would download the clip is commented out, so
 * nothing appears to supply the `filename` argument of the second step and
 * the "save failed" branch looks like the only reachable outcome — confirm
 * before relying on this function.
 *
 * @param url {String} page address handed to jsdom
 */
function getiton(url) {
  step(function fetch() {
    jsdom.env({ html: url, done: this.parallel() });
    // TODO: re-enable once downloading is wanted again.
    //get_by_object(extract(url), this.parallel());
  }, function parse(err, window, filename) {
    // parse_ogp returns an empty object for a missing window, so it is safe
    // to call even when loading the page failed.
    parse_ogp(window);

    if(!filename) {
      console.log("save failed : " + err);
      return;
    }

    console.log("Saved to : " + filename);
  });
}
module.exports.get_by_url = getiton;
module.exports.get_by_object = get_by_object;

/**
 * Resolves a video key to a local filename, downloading the video on a
 * cache miss.
 *
 * Keys have the form "naver:<category>:<id>" or "youtube:<id>". The cache is
 * consulted through models.get_video; freshly downloaded Naver clips are
 * recorded with models.set_video. YouTube downloads are handed straight to
 * the caller and are not recorded, as before.
 *
 * @param key {String} video key
 * @param callback {Function} result callback, accepts (err, filename)
 */
module.exports.get = function(key, callback) {
  models.get_video(key, function(err, cached) {
    // Errors are reported through the callback; throwing from inside an
    // asynchronous callback could not be caught by the caller.
    if(err) return callback(err);
    if(cached) return callback(null, cached);

    var parts = key.split(':');
    switch(parts[0]) {
      case 'naver': {
        // The Naver fetcher is imported as `fetch_naver`; the previous call
        // to `fetch_movieclip` referenced a name that does not exist here.
        fetch_naver(parts[1], parts[2], function(err, filename) {
          if(err) return callback(err);
          console.log("File written to " + filename);
          models.set_video(key, filename);
          callback(null, filename);
        });
        break;
      }
      case 'youtube': {
        fetch_youtube(parts[1], callback);
        break;
      }
      default: {
        // Unknown prefixes used to leave the caller waiting forever.
        callback(new Error('Unsupported video key: ' + key));
      }
    }
  });
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fetch = require('../cache_fetcher'); | |
function usage() { | |
console.log("Usage: [url]"); | |
} | |
var url = process.argv.pop(); | |
console.log(url); | |
if(url[0] === '/') { // not a url? | |
return usage(); | |
} | |
fetch.get_by_url(url); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# Downloads one YouTube video to /tmp/<id>.mp4 and prints that path on stdout.
# The video id is taken from the last command-line argument. The first format
# from @quality_preference that the video offers is requested; if none
# matches, fmt is left undefined.
use strict;
use warnings;
use WWW::YouTube::Download;
use Data::Dumper;
use List::Util;

my @quality_preference = ( 84, 85, 22, 82, 83, 18, 37, 38 );

my $id = pop @ARGV;
die "Usage: $0 <video-id>\n" unless defined $id && length $id;

# The id is interpolated into a filesystem path below, so reject anything
# that could point outside /tmp.
die "Invalid video id: $id\n" unless $id =~ /\A[A-Za-z0-9_-]+\z/;

# A lexical client instead of the global $_, which List::Util::first
# rebinds while it scans the format list.
my $client = WWW::YouTube::Download->new;
my @fmts = @{ $client->get_fmt_list($id) };

my $dst_fmt;
foreach my $value (@quality_preference) {
    my $found = List::Util::first { int($_) == int($value) } @fmts;
    if ( $found ) {
        $dst_fmt = $found;
        last;
    }
}

my $filename = "/tmp/$id.mp4";
$client->download($id, { filename => $filename, fmt => $dst_fmt });

# No trailing newline: the caller uses the whole of stdout as the filename.
print "$filename";
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var urlparse = require('url'); | |
var querystring = require('querystring'); | |
/**
 * Recognises Naver sports video URLs.
 *
 * Two shapes are understood:
 *   sports.news.naver.com/<category>/video.nhn?...id=<id>
 *   sports.news.naver.com/videoCenter...?category=<category>&id=<id>
 *
 * @param url {String} address to inspect
 * @return {Object|null} {type: 'naver', category, id}, or null when the URL
 *                       is not a Naver video address or carries no id
 */
function Dialect_naver(url) {
  if(typeof url !== 'string') return null;

  // Dots are escaped so that only the literal host name matches.
  var direct = url.match(/sports\.news\.naver\.com\/([^\/]*)\/video\.nhn\?(.*)$/);
  if(direct) {
    var directId = querystring.parse(direct[2]).id;
    // Require an id here as well, like the videoCenter form below, rather
    // than returning a description with `id: undefined`.
    if(directId) {
      return {
        type: 'naver',
        category: direct[1],
        id: directId
      };
    }
  }

  if(/sports\.news\.naver\.com\/videoCenter/.test(url)) {
    var query = querystring.parse(urlparse.parse(url).query);
    if(query.id && query.category) {
      return {
        type: 'naver',
        category: query.category,
        id: query.id
      };
    }
  }

  return null;
}
/**
 * Recognises YouTube video URLs.
 *
 * Understands the long form (".../watch?v=<id>") and the short form
 * ("youtu.be/<id>").
 *
 * @param url {String} address to inspect
 * @return {Object|null} {type: 'youtube', id}, or null when no video id can
 *                       be extracted
 */
function Dialect_youtube(url) {
  if(typeof url !== 'string') return null;

  // Long form; the fragment is excluded so it cannot end up inside the id.
  var watch = url.match(/(youtube\.com|youtu\.be)\/watch\?([^#]*)/);
  if(watch) {
    var id = querystring.parse(watch[2]).v;
    // A watch URL without `v` names no video; do not fall through to the
    // short-link rule, which would read "watch" as the id.
    return id ? { type: 'youtube', id: id } : null;
  }

  // Short links carry the id directly in the path.
  var shortLink = url.match(/youtu\.be\/([A-Za-z0-9_-]+)/);
  if(shortLink) {
    return { type: 'youtube', id: shortLink[1] };
  }

  return null;
}
/**
 * Treats its argument as a bare YouTube video id.
 *
 * @param url {String} value used verbatim as the id
 * @return {Object} {type: 'youtube', id: url}
 */
function justyoutube(url) {
  var description = { type: 'youtube' };
  description.id = url;
  return description;
}
/**
 * Tries each URL dialect in turn (Naver first, then YouTube) and returns the
 * first truthy description one of them produces.
 *
 * @param url {String} address to classify
 * @return {Object|null} the dialect's result, or null when none matched
 */
function cocktail(url) {
  var recognisers = [ Dialect_naver, Dialect_youtube ];
  var description = null;

  // `some` stops at the first recogniser that yields a result.
  recognisers.some(function(recognise) {
    description = recognise(url) || null;
    return description !== null;
  });

  return description;
}
module.exports.extract = cocktail; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var client = require('./redis').client; | |
var redis = require('redis'); | |
var hashid = 'scraper'; | |
/**
 * Reads every entry of the 'rss' list.
 *
 * @param callback {Function} optional, accepts (err, entries); redis.print is
 *                            used when omitted
 */
module.exports.rsslist = function RSSList(callback) {
  var done = callback || redis.print;
  return client.lrange('rss', 0, -1, done);
};
/**
 * Pushes `item`, serialised as JSON, onto the head of the 'rss' list.
 *
 * @param item {Object} entry to store
 * @param callback {Function} optional, accepts (err, reply); redis.print is
 *                            used when omitted
 */
module.exports.addtorss = function AddToRSS(item, callback) {
  var done = callback || redis.print;
  var serialised = JSON.stringify(item);
  return client.lpush('rss', serialised, done);
};
/**
 * Deletes the 'rss' list.
 *
 * @param callback {Function} passed straight to the redis client
 */
module.exports.resetrss = function ResetRSS(callback) {
  var reply = client.del('rss', callback);
  return reply;
};
/**
 * Stores `value` under `key` in the redis hash `hash`.
 *
 * @param callback {Function} optional, accepts (err, reply)
 */
function set_hash(hash, key, value, callback) {
  // Only forward a real callback so the call without one is exactly the
  // three-argument form used before.
  if(typeof callback === 'function') {
    return client.hset(hash, key, value, callback);
  }
  return client.hset(hash, key, value);
};

/**
 * Reads `key` from the redis hash `hash`.
 *
 * @param callback {Function} accepts (err, value)
 */
function get_hash(hash, key, callback) {
  client.hget(hash, key, callback);
};

/**
 * Records the value (the cached filename, per the cache fetcher's usage)
 * for a video key.
 *
 * @param key {String} video key
 * @param value {String} value to remember
 * @param callback {Function} optional, accepts (err, reply); lets callers
 *                            notice a failed write
 */
module.exports.set_video = function(key, value, callback) {
  set_hash(hashid, key, value, callback);
}

/**
 * Looks up the value recorded for a video key.
 *
 * @param key {String} video key
 * @param callback {Function} accepts (err, value)
 */
module.exports.get_video = function(key, callback) {
  get_hash(hashid, key, callback);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "naverscraper",
  "version": "0.0.0",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git@gist.github.com:3185663.git"
  },
  "dependencies": {
    "request": "latest",
    "step": "latest",
    "jsdom": "~0.2.15",
    "mkdirp": "latest",
    "redis": "latest",
    "express": "latest",
    "jstoxml": "latest",
    "fluent-ffmpeg": "~1.1.7",
    "temp": "~0.4.0"
  },
  "author": "blmarket",
  "license": "BSD"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Creates the single redis client used by the other modules and exports it.
var redis = require('redis');
var client = redis.createClient();

// Connection problems are rethrown rather than handled.
function rethrow(err) {
  throw err;
}

// Ask the client to quit when the process is exiting.
function closeClient() {
  client.quit();
}

client.on('error', rethrow);
process.on('exit', closeClient);

module.exports.client = client;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var model = require('./models'); | |
var jstoxml = require('jstoxml'); | |
module.exports = function(callback) { | |
model.rsslist(function(err, res) { | |
if(err) callback(err); | |
var ret = { | |
_name: 'rss', | |
_attrs: { | |
'xmlns:itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', | |
version: '2.0' | |
}, | |
_content: { | |
channel: [ | |
{title: 'my favourites'}, | |
{link: 'black market'}, | |
{description: "oh yes I don't know about copyrights"}, | |
{lastBuildDate: function() { return new Date(); }}, | |
{pubDate: function() { return new Date(); }}, | |
{language: 'ko_KR'}, | |
{ | |
_name: 'itunes:image', | |
_attrs: { | |
href: 'http://naver.com/favicon.ico' | |
} | |
} | |
] | |
} | |
}; | |
var target = ret._content.channel; | |
(function() { | |
if(!res) return; | |
var length = res.length; | |
for(var i=0;i<length;i++) { | |
var obj = JSON.parse(res[i]); | |
var tmp = obj.key.split(':'); | |
var category = tmp[0]; | |
var id = tmp[1]; | |
var type = obj.type || 'video/x-flv'; | |
target[target.length] = { | |
item: [ | |
{ title: obj.title }, | |
{ | |
_name: 'enclosure', | |
_attrs: { | |
url: 'http://192.168.0.4:3279/get/' + obj.key, | |
type: type | |
} | |
}, | |
{ description: obj.desc }, | |
{ pubDate: function() { | |
return new Date(); | |
}} | |
] | |
} | |
} | |
})(); | |
callback(null, jstoxml.toXML(ret, {header: true, indent: ' '})); | |
}); | |
}; | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var spawn = require('child_process').spawn; | |
var fs = require('fs'); | |
var path = require('path'); | |
var mkdirp = require('mkdirp'); | |
var models = require('./models'); | |
/**
 * Runs the Perl download script for one video and reports the filename it
 * prints on stdout.
 *
 * The script path is relative to the process's working directory, as before.
 *
 * @param id {String} video id passed to the script as its only argument
 * @param callback {Function} accepts (err, filename); called exactly once.
 *                            An error is reported when the script cannot be
 *                            started, exits with a non-zero status, or prints
 *                            nothing.
 */
function Download(id, callback) {
  var finished = false;
  function finish(err, result) {
    if(finished) return;
    finished = true;
    callback(err, result);
  }

  var child;
  try {
    child = spawn('../perl/download.pl', [ id ]);
  } catch(spawnErr) {
    return finish(spawnErr);
  }

  var filename = '';
  var diagnostics = '';

  // Failing to start the script surfaces as an 'error' event; unhandled it
  // would crash the process.
  child.on('error', function(err) {
    finish(err);
  });

  if(child.stdin) child.stdin.end();

  if(child.stdout) {
    child.stdout.on('data', function(chunk) {
      filename = filename + chunk.toString();
    });
  }

  // Consume stderr so the child cannot stall on a full pipe, and keep the
  // text for the error message.
  if(child.stderr) {
    child.stderr.on('data', function(chunk) {
      diagnostics = diagnostics + chunk.toString();
    });
  }

  // 'close' fires after the stdio streams have ended, so `filename` is
  // complete at this point.
  child.on('close', function(code) {
    if(code !== 0) {
      return finish(new Error('download.pl exited with code ' + code +
                              (diagnostics ? ': ' + diagnostics : '')));
    }
    if(!filename) {
      return finish(new Error('download.pl printed no filename'));
    }
    finish(null, filename);
  });
}
// Download('w5hgaNxzORs'); | |
module.exports.download = Download; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment