Created
August 14, 2009 21:32
-
-
Save teramako/168118 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/lib/xulrunner/xpcshell | |
const USAGE = <><![CDATA[ | |
XULRunner付属のxpcshellで動くJavaScript | |
引数のURLのHTML文字列を取得しパースして、再度文字列化して出力するもの | |
Usage: xpcshell curl.js URL | |
]]></>.toString(); | |
const Cc = Components.classes; | |
const Ci = Components.interfaces; | |
var ios = Cc["@mozilla.org/network/io-service;1"].getService(Ci.nsIIOService); | |
var suh = Cc["@mozilla.org/feed-unescapehtml;1"].getService(Ci.nsIScriptableUnescapeHTML); | |
var parser = Cc["@mozilla.org/xmlextras/domparser;1"].createInstance(Ci.nsIDOMParser); | |
var ser = Cc["@mozilla.org/xmlextras/xmlserializer;1"].createInstance(Ci.nsIDOMSerializer); | |
var xslt = Cc["@mozilla.org/document-transformer;1?type=xslt"].createInstance(Ci.nsIXSLTProcessor); | |
function main(args){ | |
var url = args[0]; | |
if (!url){ | |
usage(); | |
quit(); | |
} | |
var htmlstr = httpGet(url); | |
// UTF8octetに変換 | |
htmlstr = unescape(encodeURIComponent(htmlstr)); | |
var title = getTitleFromHTMLString(htmlstr); | |
var doc = createHTMLDocument(<title>{title}</title>); | |
var html = parseHTML(htmlstr, doc, url); | |
doc.body.appendChild(html); | |
var xhtmlstr = tag2LowerCase(ser.serializeToString(doc)); | |
print(xhtmlstr); | |
} | |
/**
 * Prints the usage text held in the USAGE constant via print().
 */
function usage(){
  var text = USAGE;
  print(text);
}
/**
 * Fetches the resource at `url` synchronously and returns its body as a
 * decoded string.
 *
 * A channel from the IO service is used rather than XMLHttpRequest because
 * the channel exposes the charset announced in the Content-Type header, which
 * is needed to decode the body correctly. "UTF-8" is assumed when the channel
 * reports no charset.
 *
 * @param url  URL string to fetch
 * @return the decoded body; "" when an HTTP channel answers with a status
 *         other than 200. If decoding or reading fails, the error is printed
 *         and whatever was read so far is returned.
 */
function httpGet(url){
  var channel = ios.newChannel(url, 0, null);
  var stream = channel.open();

  if (channel instanceof Ci.nsIHttpChannel && channel.responseStatus !== 200){
    // Release the already-opened stream before bailing out.
    stream.close();
    return "";
  }

  var charset = channel.contentCharset || "UTF-8";
  var iconv = Cc["@mozilla.org/intl/converter-input-stream;1"].createInstance(Ci.nsIConverterInputStream);
  var htmlstr = "";
  try {
    // init() is inside the try so a failure here (e.g. a charset the
    // converter rejects) still reaches the cleanup below.
    iconv.init(stream, charset, 1024, Ci.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
    var chunk = {};
    // readString() fills chunk.value; the loop stops once it reports that
    // nothing more was read.
    while (iconv.readString(4096, chunk) !== 0){
      htmlstr += chunk.value;
    }
  } catch(e){
    // Best effort: report the problem and return what was read so far.
    print(e);
  } finally {
    // Close the raw stream even if closing the converter throws.
    try {
      iconv.close();
    } finally {
      stream.close();
    }
  }
  return htmlstr;
}
/** | |
* nsIXSLTProcessorを使ってHTMLDocumentを生成 | |
* | |
* Components.classesByID['{5d0fcdd0-4daa-11d2-b328-00805f8a3859}'].createInstance(Ci.nsIDOMHTMLDocument) | |
* でもHTMLDocumentを生成できる(@see http://twitter.com/nanto_vi/status/3314628619 ) | |
*/ | |
function createHTMLDocument(header){ | |
var xsl = <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> | |
<xsl:output method="html"/> | |
<xsl:template match="/"> | |
<html> | |
<head>{header}</head> | |
<body/> | |
</html> | |
</xsl:template> | |
</xsl:stylesheet>; | |
var xml = <html xmlns="http://www.w3.org/1999/xhtml"></html>; | |
var doc = parser.parseFromString(xml.toXMLString(), "application/xml"); | |
var styleNode = parser.parseFromString(xsl.toXMLString(), "application/xml"); | |
xslt.importStylesheet(styleNode); | |
var htmldoc = xslt.transformToDocument(doc); | |
htmldoc.QueryInterface(Ci.nsIDOMHTMLDocument); | |
htmldoc.documentElement.QueryInterface(Ci.nsIDOMHTMLElement).QueryInterface(Ci.nsIDOMHTMLHtmlElement); | |
htmldoc.body.QueryInterface(Ci.nsIDOMHTMLElement).QueryInterface(Ci.nsIDOMHTMLBodyElement); | |
return htmldoc; | |
} | |
/**
 * Parses an HTML string into a fragment after rewriting its href attribute
 * values against `baseURL`.
 *
 * @param htmlstr  HTML source text
 * @param doc      document used to create the context element for parsing
 * @param baseURL  URL string that href values are resolved against
 * @return whatever suh.parseFragment() returns for the rewritten markup
 */
function parseHTML(htmlstr, doc, baseURL){
  var absolutized = relateURL2ABS(htmlstr, baseURL);
  // An <xml> element created in `doc` serves as the parsing context node.
  var contextNode = doc.createElement("xml");
  return suh.parseFragment(absolutized, false, null, contextNode);
}
/**
 * Builds a URI object from a URL string via the IO service.
 *
 * @param url  URL string
 * @return the object returned by ios.newURI()
 */
function createURI(url){
  // No origin charset and no base URI are supplied.
  var originCharset = null;
  var baseURI = null;
  return ios.newURI(url, originCharset, baseURI);
}
/**
 * Rewrites quoted href attribute values so they are resolved against
 * `baseURL`.
 *
 * The attribute name is matched case-insensitively (HREF= is valid HTML) and
 * is always written back as lowercase "href". The value may contain the quote
 * character that is NOT used as its delimiter, e.g. href="it's.html".
 *
 * @param htmlstr  HTML source text
 * @param baseURL  URL string that href values are resolved against
 * @return the HTML text with every quoted href value passed through
 *         uri.resolve()
 */
function relateURL2ABS(htmlstr, baseURL){
  var uri = createURI(baseURL);
  function replacer(all, quote, url){
    return "href=" + quote + uri.resolve(url) + quote;
  }
  // \1 forces the closing quote to match the opening one; the lazy
  // [\s\S]*? stops at the first such quote.
  return htmlstr.replace(/href=(["'])([\s\S]*?)\1/gi, replacer);
}
/**
 * Lowercases the tag names in a markup string. Attributes and text between
 * tags are left exactly as they are.
 *
 * @param htmlstr  markup text
 * @return the text with every opening/closing tag name lowercased
 */
function tag2LowerCase(htmlstr){
  return htmlstr.replace(/<(\/)?([\w]+)([\s\S]*?)>/g, function(all, close, tag, attr){
    // `close` is undefined for opening tags because the optional group did
    // not participate; coerce it so "undefined" is never emitted.
    return "<" + (close || "") + tag.toLowerCase() + attr + ">";
  });
}
/**
 * Extracts the text between the first <title> ... </title> pair of an HTML
 * string.
 *
 * The tag name is matched case-insensitively and attributes on the opening
 * tag are accepted, so <TITLE> and <title lang="en"> are recognised too.
 *
 * @param htmlstr  HTML source text
 * @return the raw title text (not trimmed or entity-decoded), or "" when no
 *         title element is found or the title is empty
 */
function getTitleFromHTMLString(htmlstr){
  // \b keeps look-alike tags such as <titles> from matching.
  var match = /<title\b[^>]*>([\s\S]*?)<\/title\s*>/i.exec(htmlstr);
  return match ? match[1] : "";
}
/**
 * Debug helper: prints a separator line, the object itself, and then one
 * "name: value" line for every enumerable property (for-in order).
 *
 * Reading or stringifying a property can throw (for example from a getter);
 * such a property is reported as "name: <error: ...>" instead of aborting
 * the rest of the dump.
 *
 * @param obj  value to inspect
 */
function dump(obj){
  print("============================================");
  print(obj);
  for (var key in obj){
    var line;
    try {
      line = key + ": " + obj[key];
    } catch (e){
      line = key + ": <error: " + e + ">";
    }
    print(line);
  }
}
main(arguments); | |
// vim: sw=2 ts=2 et |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment