Last active
August 29, 2015 13:57
-
-
Save kamikat/9682070 to your computer and use it in GitHub Desktop.
Crawler for kasi-time.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Usage: node kasi.js <id>
// Example:
//   Get the lyrics from http://www.kasi-time.com/item-55961.html
//   node kasi.js 55961
var http = require('http');
var vm = require('vm');
// Item id to fetch, taken from the first command-line argument
// (undefined when the script is started without one).
var no = process.argv[2];
// Render an HTML fragment to plain text.
//
// render(html) -> string
//   html: the markup to convert; null/undefined is treated as ''.
//   Returns the text content with
//     - '\n' for every <br>/<hr> tag,
//     - '\n' after every closed (or implicitly closed) li, p, tr, table,
//     - '\t' after every closed (or implicitly closed) td,
//     - character references (&amp;, &#38;, &#x26;) decoded.
//   Everything between '<' and '>' is dropped from the output, including
//   comments and unknown tags. Unknown entities are left untouched.
var render = (function () {
  // Focus on horizontal (tab) and vertical (line break) layout and on
  // escaped characters; everything else is ignored.
  var br0 = ['br', 'hr'];                // tags that are a line break themselves
  var br1 = ['li', 'p', 'tr', 'table'];  // elements followed by a line break
  var tab = ['td'];                      // elements followed by a tab

  // Escape map from entity name to character code.
  // 'nbsp' intentionally maps to a plain space (32) for plain-text output.
  var esc = {
    'euro': 8364, 'nbsp': 32, 'quot': 34, 'amp': 38, 'lt': 60, 'gt': 62,
    'iexcl': 161, 'cent': 162, 'pound': 163, 'curren': 164, 'yen': 165,
    'brvbar': 166, 'sect': 167, 'uml': 168, 'copy': 169, 'ordf': 170,
    'laquo': 171, 'not': 172, 'shy': 173, 'reg': 174, 'macr': 175,
    'deg': 176, 'plusmn': 177, 'sup2': 178, 'sup3': 179, 'acute': 180,
    'micro': 181, 'para': 182, 'middot': 183, 'cedil': 184, 'sup1': 185,
    'ordm': 186, 'raquo': 187, 'frac14': 188, 'frac12': 189, 'frac34': 190,
    'iquest': 191, 'Agrave': 192, 'Aacute': 193, 'Acirc': 194, 'Atilde': 195,
    'Auml': 196, 'Aring': 197, 'AElig': 198, 'Ccedil': 199, 'Egrave': 200,
    'Eacute': 201, 'Ecirc': 202, 'Euml': 203, 'Igrave': 204, 'Iacute': 205,
    'Icirc': 206, 'Iuml': 207, 'ETH': 208, 'Ntilde': 209, 'Ograve': 210,
    'Oacute': 211, 'Ocirc': 212, 'Otilde': 213, 'Ouml': 214, 'times': 215,
    'Oslash': 216, 'Ugrave': 217, 'Uacute': 218, 'Ucirc': 219, 'Uuml': 220,
    'Yacute': 221, 'THORN': 222, 'szlig': 223, 'agrave': 224, 'aacute': 225,
    'acirc': 226, 'atilde': 227, 'auml': 228, 'aring': 229, 'aelig': 230,
    'ccedil': 231, 'egrave': 232, 'eacute': 233, 'ecirc': 234, 'euml': 235,
    'igrave': 236, 'iacute': 237, 'icirc': 238, 'iuml': 239, 'eth': 240,
    'ntilde': 241, 'ograve': 242, 'oacute': 243, 'ocirc': 244, 'otilde': 245,
    'ouml': 246, 'divide': 247, 'oslash': 248, 'ugrave': 249, 'uacute': 250,
    'ucirc': 251, 'uuml': 252, 'yacute': 253, 'thorn': 254, 'yuml': 255
  };
  var hasOwn = Object.prototype.hasOwnProperty;

  // Convert a Unicode code point to a string, building a surrogate pair for
  // code points above U+FFFF. Out-of-range values yield `fallback`.
  var fromCodePoint = function (code, fallback) {
    if (!(code >= 0 && code <= 0x10FFFF)) return fallback;
    if (code < 0x10000) return String.fromCharCode(code);
    var offset = code - 0x10000;
    return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  };

  // Decode named (&amp;), decimal (&#38;) and hexadecimal (&#x26;) character
  // references in a single pass. References that are not recognised are
  // returned unchanged instead of being rewritten.
  var decodeEntities = function (str) {
    var reference = /&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);/g;
    return str.replace(reference, function (match, entity) {
      if (entity.charAt(0) !== '#') {
        // Own-property check so names such as 'constructor' are not resolved
        // through Object.prototype.
        return hasOwn.call(esc, entity) ? String.fromCharCode(esc[entity]) : match;
      }
      var isHex = entity.charAt(1) === 'x' || entity.charAt(1) === 'X';
      var code = parseInt(entity.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return fromCodePoint(code, match);
    });
  };

  // Text emitted when the element `name` ends.
  var separatorFor = function (name) {
    return (br1.indexOf(name) !== -1 ? '\n' : '') +
           (tab.indexOf(name) !== -1 ? '\t' : '');
  };

  return function (html) {
    var source = html == null ? '' : String(html);
    var text = '';
    var tagRe = /<([^<>]+)>/g, match, lastIndex = 0;
    var nest = [];  // stack of currently open element names
    while ((match = tagRe.exec(source))) {
      // Text between the previous tag and this one
      text += decodeEntities(source.slice(lastIndex, match.index));
      lastIndex = tagRe.lastIndex;

      // Split "<name attr=...>" / "</name>" / "<name ... />" into its parts.
      // Only the element name takes part in matching; attributes are ignored.
      var raw = match[1];
      var parts = /^(\/?)\s*([A-Za-z][A-Za-z0-9:-]*)/.exec(raw);
      if (!parts) continue;  // comment, doctype, ... -> dropped, no stack effect
      var closing = parts[1] === '/';
      var selfClosing = !closing && /\/\s*$/.test(raw);
      var name = parts[2].toLowerCase();

      // <br>, <br/>, <hr> ... are line breaks and never live on the stack
      if (br0.indexOf(name) !== -1) {
        text += '\n';
        continue;
      }

      if (!closing) {
        // A repeated separator element (<li>a<li>b) implicitly ends its
        // sibling: emit the separator and let the existing stack entry
        // stand for the new element.
        var implicit = separatorFor(name) !== '' && nest[nest.length - 1] === name;
        if (implicit || selfClosing) text += separatorFor(name);
        else nest.push(name);
        continue;
      }

      // Stray closing tag without a matching open tag: ignore it rather than
      // unwinding (and spinning on) an empty stack.
      if (nest.lastIndexOf(name) === -1) continue;

      // Close the element together with every unclosed element inside it
      var last;
      do {
        last = nest.pop();
        text += separatorFor(last);
      } while (last !== name);
    }
    text += decodeEntities(source.slice(lastIndex));
    // Elements still open at the end of input are closed implicitly
    while (nest.length > 0) {
      text += separatorFor(nest.pop());
    }
    return text;
  };
})();
// Fetch the item's script from kasi-time.com and print the lyric as plain text.
//
// The response body is treated as JavaScript that calls document.write(html);
// it is executed in a fresh context whose only global is a stub `document`
// that renders the HTML to text and prints it on stdout.
// Diagnostics go to stderr and set a non-zero exit code, so stdout carries
// nothing but the rendered text.
if (!no) {
  console.error('Usage: node kasi.js <id>');
  process.exitCode = 1;
} else {
  http.get('http://www.kasi-time.com/item_js.php?no=' + encodeURIComponent(no), function (res) {
    if (res.statusCode !== 200) {
      // Drain the body so the underlying socket is released
      res.resume();
      process.exitCode = 1;
      return console.error('ERROR - HTTP Status ' + res.statusCode);
    }
    // Collect the chunks and join them once at the end: no fixed size limit,
    // and multi-byte characters split across chunks are decoded correctly.
    var chunks = [];
    res
      .on('data', function (chunk) {
        chunks.push(chunk);
      })
      .on('end', function () {
        // Get JavaScript code
        var code = Buffer.concat(chunks).toString();
        // Create a Sandbox (Execution Context Object)
        var sandbox = {
          document: {
            write: function (html) {
              console.log(render(html));
            }
          }
        };
        // NOTE(security): this executes code downloaded from a remote server.
        // The vm module is not a security mechanism; the new context only
        // keeps the script from seeing this module's variables. The timeout
        // guards against a script that never terminates.
        try {
          // Compile code to a Script object, then execute it in the sandbox
          var script = new vm.Script(code, { filename: 'item.js' });
          script.runInNewContext(sandbox, { timeout: 5000 });
        } catch (err) {
          // e.g. the server answered with something that is not valid JavaScript
          process.exitCode = 1;
          console.error(err);
        }
      });
  }).on('error', function (err) {
    process.exitCode = 1;
    console.error(err);
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment