Created
March 28, 2013 19:48
-
-
Save chriskk/5266228 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/local/bin/ruby
## ymHTML - Simple HTML Parser
## (c) 2003-2011 yoshidam
## You can redistribute it and/or modify it under the same terms as Ruby.
##
## Feb 11, 2011 yoshidam version 0.1.18 HTML5 style charset declaration
## Mar 3, 2009 yoshidam version 0.1.16 parseDTD bug fix
## Aug 6, 2009 yoshidam version 0.1.15 encoding bug fix
## Aug 1, 2009 yoshidam version 0.1.14 Ruby 1.9
## Nov 17, 2007 yoshidam version 0.1.13 Windows-1252
## Oct 30, 2006 yoshidam version 0.1.12 NKF, Iconv
## Sep 23, 2006 yoshidam version 0.1.11 comment end bug fix
## Apr 12, 2006 yoshidam version 0.1.10 forceHTML option
## Mar 7, 2006 yoshidam version 0.1.9 iso-2022-jp bug fix
## Nov 15, 2005 yoshidam version 0.1.8 table border
## Apr 6, 2004 yoshidam version 0.1.7 InputStream
## Mar 10, 2004 yoshidam version 0.1.6 exception, InputStream
## Sep 17, 2003 yoshidam version 0.1.5 bug fix
## Apr 05, 2003 yoshidam version 0.1.4
## Apr 04, 2003 yoshidam version 0.1.3
## Apr 02, 2003 yoshidam version 0.1.2
## Mar 27, 2003 yoshidam version 0.1.1
## Mar 26, 2003 yoshidam version 0.1.0
module YmHTML | |
VERSION = 0.117 | |
## Base class of the exceptions defined by this library.
class Error < StandardError; end

## Raised when the input cannot be parsed.
class ParseError < Error; end
## Raised when the input is not valid in its character encoding
## (see the valid_encoding? check in Parser#parse).
class EncodingError < Error; end
class Parser | |
HEAD_MISC = "script|style|meta|link|object" | |
HEADING = "h1|h2|h3|h4|h5|h6" | |
LIST = "ul|ol|dir|menu" | |
PREFORMATTED = "pre" | |
FONTSTYLE = "tt|i|b|u|s|strike|big|small" | |
PHRASE = "em|strong|dfn|code|samp|kbd|var|cite|abbr|acronym" | |
SPECIAL = "a|img|applet|object|font|basefont|br|script|map|q|sub|sup|span|bdo|iframe" | |
FORMCTRL = "input|select|textarea|label|button" | |
INLINE = "#{FONTSTYLE}|#{PHRASE}|#{SPECIAL}|#{FORMCTRL}|ins|del" | |
BLOCK = "p|#{HEADING}|#{LIST}|#{PREFORMATTED}|dl|div|center|noscript|noframes|blockquote|form|isindex|hr|table|fieldset|address|ins|del" | |
FLOW = "#{BLOCK}|#{INLINE}" | |
EMPTY = '' | |
ContentList = {} | |
OpenElements = [ | |
## ['omitted tag', 'outer', 'inner'] | |
['html', nil, /^(head|body)$/u], | |
['head', 'html', /^(title|base|#{HEAD_MISC})$/u], | |
['body', 'html', /^(#{BLOCK}|script|ins|del)$/u], | |
['body', 'noframes', /^(#{BLOCK}|script|ins|del)$/u], | |
['tbody', 'table', /^tr$/], | |
## invalid omissions | |
['tr', 'tbody', /^td$/], | |
['dd', 'dl', /^(#{FLOW})$/], | |
['td', 'tr', /^(#{FLOW})$/], | |
['ul', proc {|p| p !~ /^(ul|ol|li)$/}, /^li$/], | |
] | |
HAVE_PCDATA = /^(option|textarea|fieldset|title|#{FONTSTYLE}|#{PHRASE}|sub|sup|span|bdo|font|address|a|p|#{HEADING}|pre|q|dt|label|legend|caption|body|div|center|object|applet|blockquote|ins|del|dd|li|form|button|th|td|iframe|noscript)$/u | |
ATTR_NAME = { 'table' => [ | |
['frame', /^(void|above|below|hsides|lhs|rhs|vsides|box|border)$/ ] | |
] | |
} | |
## Return the character code of +c+: the first code point when +c+ is a
## String, otherwise +c+ itself.
def self.charCode(c)
  return c unless c.class == String
  c.ord
end

## character code of '-'
MINUS = charCode(?-)
## Build a Regexp from the pattern string +s+, treating it as UTF-8.
##
## s::   pattern source
## opt:: options, passed straight to Regexp.new
##
## The pattern is re-tagged on a copy, so the caller's string keeps its
## encoding and frozen strings are accepted.
def self.regnew(s, opt = nil)
  if s.respond_to?(:force_encoding)
    Regexp.new(s.dup.force_encoding(::Encoding::UTF_8), opt)
  else
    ## Ruby 1.8: strings carry no encoding, request a UTF-8 regexp
    Regexp.new(s, opt, 'u')
  end
end
private | |
## Can the element +name+ contain character data?
## Returns the match position (truthy) or nil.
def havePCDATA?(name)
  matched_at = (name =~ HAVE_PCDATA)
  matched_at
end
## Work out which start tags were omitted between +parent+ and +child+.
##
## parent:: name of the currently open element (nil at the document root)
## child::  name of the element whose start tag was just read
##
## Returns the implied element names, outermost first, or nil when no
## rule in OpenElements applies.
def guessOmittedTag(parent, child)
  OpenElements.each do |omitted, outer, inner|
    inner_ok = case inner
               when Regexp then child =~ inner
               when Proc then inner.call(child)
               end
    next unless inner_ok

    outer_ok = case outer
               when nil then parent.nil?
               when String then outer == parent
               when Proc then outer.call(parent)
               end
    return [omitted] if outer_ok

    ## the rule fits the child but not the parent:
    ## try to reach +omitted+ itself through further omitted tags
    next if omitted.is_a?(Proc) || inner.is_a?(Proc)
    chain = guessOmittedTag(parent, omitted)
    return chain.push(omitted) if chain
  end
  nil
end
## Register the content model of one or more elements.
##
## elements:: '|'-separated names of the elements being described
## content::  '|'-separated names of the child elements they may contain
##            ('' for none)
##
## An earlier registration for the same element is overwritten.
def self.setContentList(elements, content)
  elements.split('|').each { |element|
    ContentList[element] = regnew("^(#{content})$", nil)
  }
end
## Content models, as [element names, allowed child elements] pairs.
## They are registered in this order; a later pair replaces an earlier one
## for the same element.
[
  [PHRASE, INLINE],
  # ['body', "#{BLOCK}|script"],
  ['body', FLOW],
  ['p', INLINE],
  ['dt', INLINE],
  ['dd', FLOW],
  ['li', FLOW],
  ['option', ''],
  ['thead', 'tr'],
  ['tfoot', 'tr'],
  ['tbody', 'tr'],
  ['colgroup', 'col'],
  ['tr', 'th|td'],
  ['th|td', FLOW],
  ['head', "title|base|#{HEAD_MISC}"],
  ['html', "head|body|frameset"],
  ## empty element
  ['br|area|link|img|param|hr|input|col|base|meta|basefont|frame|isindex', ''],
  ## elements which cannot omit end tag
  ["#{FONTSTYLE}|#{PHRASE}", INLINE],
  ['sub|sup|bdo|font', INLINE],
  ['address', "#{INLINE}|p"],
  ['div|center', FLOW],
  ## change for HTML5
  ['a', "#{BLOCK}|#{INLINE.sub(/\ba\|/, '')}"],
  ['map', "#{BLOCK}|area"],
  ['object|applet', "param|#{FLOW}"],
  [HEADING, INLINE],
  ['pre', INLINE],
  ['q', INLINE],
  ['blockquote|ins|del', FLOW],
  ['dl', 'dt|dd'],
  ['ol|ul|dir|menu', 'li'],
  ['form', FLOW],
  ['label', INLINE],
  ['select', 'optgroup|option'],
  ['optgroup', 'option'],
  ['textarea', ''],
  ['fieldset', "legend|#{FLOW}"],
  ['legend|caption', INLINE],
  ['button', FLOW],
  ['table', 'caption|col|colgroup|thead|tfoot|tbody'],
  ['frameset', 'frameset|frame|noframes'],
  ['iframe', FLOW],
  ['noframes', "body|#{FLOW}"],
  ['title', ''],
  ['style|script', ''],
  ['noscript', FLOW],
].each { |elements, content| setContentList(elements, content) }
## Replace every tab, CR and LF in an attribute value with a space.
## Returns a new string.
def normalizeAttrValue(str)
  line_whitespace = /[\x9\r\n]/
  str.gsub(line_whitespace, ' ')
end
## Convert the code point +num+ of a numeric character reference into a
## one-character UTF-8 string.
##
## Surrogates (U+D800..U+DFFF) and values above U+10FFFF are not Unicode
## scalar values; packing them would give an invalidly encoded string or a
## RangeError, so they are replaced by U+FFFD REPLACEMENT CHARACTER.
def numToChar(num)
  if num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF)
    num = 0xFFFD
  end
  char = [num].pack("U")
  char.force_encoding(::Encoding::UTF_8) if char.respond_to?(:force_encoding)
  char
end
## Expand entity and character references in content text.
## Returns '' for nil.
def expandRef(text = nil)
  return '' if text.nil?
  expandReferences(text) { |value| expandRef(value) }
end

## Expand entity and character references in an attribute value.
## Tab, CR and LF are turned into spaces first. Returns '' for nil.
def expandAttrValue(text = nil)
  return '' if text.nil?
  expandReferences(normalizeAttrValue(text)) { |value| expandAttrValue(value) }
end

## Shared scanner behind expandRef and expandAttrValue.
##
## Replaces "&#N;", "&#xH;" (and "&#XH;" outside XHTML mode) with the
## character itself. For "&name;" the replacement text is looked up with
## getEntity and handed to the block, which expands it recursively.
## The trailing ';' is optional in every form.
def expandReferences(text)
  ret = []
  ## String#tainted? was removed in Ruby 3.2; only propagate taint where
  ## the interpreter still has it
  ret.taint if text.respond_to?(:tainted?) && text.tainted?
  while text =~ /\&[\#0-9a-zA-Z]+\;?/
    found = Regexp.last_match
    ref = found[0]
    ret.push(found.pre_match) if found.pre_match != ''
    text = found.post_match
    if ref =~ /^\&\#([0-9]+);?$/
      ## Numeric Character Reference (Decimal)
      ref = numToChar($1.to_i)
    elsif ref =~ /^\&\#x([0-9a-fA-F]+);?$/
      ## Numeric Character Reference (Hexadecimal)
      ref = numToChar($1.hex)
    elsif !@xhtmlp && ref =~ /^\&\#X([0-9a-fA-F]+);?$/
      ## Numeric Character Reference (Hexadecimal, upper-case X)
      ref = numToChar($1.hex)
    else
      ## Entity Reference
      ref = yield(getEntity(ref)) ## expand recursively
    end
    ret.push(ref)
  end
  ret.push(text) if text != ''
  ret.join('')
end
## Define the entity +entname+ with the replacement text +entval+.
## An entity that is already defined keeps its first value.
def registerEntity(entname, entval)
  @entity[entname] = entval if @entity[entname].nil?
end
## Look up the replacement text of an entity reference.
##
## entname:: the reference as written ("&name;"; the '&' and ';' are
##           optional)
##
## Returns the registered value. For an unknown entity it raises ParseError
## in XHTML mode; otherwise it returns the reference with its '&' escaped
## as "&amp;", so that the recursive expansion done by the callers yields
## the original text instead of looking the same name up forever.
def getEntity(entname)
  name = entname.sub(/^\&?([\#0-9a-zA-Z]+)\;?$/u, '\1')
  value = @entity[name]
  return value unless value.nil?
  if @xhtmlp
    raise ParseError.new("undeclarated entity reference: #{entname.inspect}")
  end
  entname.sub(/&/, '&amp;')
end
## Entities registered by default, as [name, code point] pairs: the five
## predefined entities followed by the HTML 4 Latin-1, symbol and special
## character entities. Code points are used instead of literal characters
## so that invisible characters (nbsp, shy, zwj, lrm, ...) cannot be lost
## or altered when the source file is edited or copied.
DEFAULT_ENTITIES = (
  [['quot', 34], ['amp', 38], ['lt', 60], ['gt', 62], ['apos', 39]] +
  ## Latin-1: U+00A0 .. U+00FF
  %w[
    nbsp iexcl cent pound curren yen brvbar sect uml copy ordf laquo not shy reg macr
    deg plusmn sup2 sup3 acute micro para middot cedil sup1 ordm raquo frac14 frac12 frac34 iquest
    Agrave Aacute Acirc Atilde Auml Aring AElig Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml
    ETH Ntilde Ograve Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml Yacute THORN szlig
    agrave aacute acirc atilde auml aring aelig ccedil egrave eacute ecirc euml igrave iacute icirc iuml
    eth ntilde ograve oacute ocirc otilde ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml
  ].each_with_index.map { |name, i| [name, 160 + i] } +
  [['fnof', 402]] +
  ## Greek capitals: U+0391 .. U+03A1 and U+03A3 .. U+03A9
  %w[
    Alpha Beta Gamma Delta Epsilon Zeta Eta Theta Iota Kappa Lambda Mu Nu Xi Omicron Pi Rho
  ].each_with_index.map { |name, i| [name, 913 + i] } +
  %w[
    Sigma Tau Upsilon Phi Chi Psi Omega
  ].each_with_index.map { |name, i| [name, 931 + i] } +
  ## Greek small letters: U+03B1 .. U+03C9
  %w[
    alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu nu xi omicron pi rho
    sigmaf sigma tau upsilon phi chi psi omega
  ].each_with_index.map { |name, i| [name, 945 + i] } +
  [
    ['thetasym', 977], ['upsih', 978], ['piv', 982],
    ['bull', 8226], ['hellip', 8230], ['prime', 8242], ['Prime', 8243],
    ['oline', 8254], ['frasl', 8260],
    ['weierp', 8472], ['image', 8465], ['real', 8476], ['trade', 8482],
    ['alefsym', 8501],
    ['larr', 8592], ['uarr', 8593], ['rarr', 8594], ['darr', 8595],
    ['harr', 8596], ['crarr', 8629],
    ['lArr', 8656], ['uArr', 8657], ['rArr', 8658], ['dArr', 8659],
    ['hArr', 8660],
    ['forall', 8704], ['part', 8706], ['exist', 8707], ['empty', 8709],
    ['nabla', 8711], ['isin', 8712], ['notin', 8713], ['ni', 8715],
    ['prod', 8719], ['sum', 8721], ['minus', 8722], ['lowast', 8727],
    ['radic', 8730], ['prop', 8733], ['infin', 8734], ['ang', 8736],
    ['and', 8743], ['or', 8744], ['cap', 8745], ['cup', 8746],
    ['int', 8747], ['there4', 8756], ['sim', 8764], ['cong', 8773],
    ['asymp', 8776], ['ne', 8800], ['equiv', 8801], ['le', 8804],
    ['ge', 8805], ['sub', 8834], ['sup', 8835], ['nsub', 8836],
    ['sube', 8838], ['supe', 8839], ['oplus', 8853], ['otimes', 8855],
    ['perp', 8869], ['sdot', 8901],
    ['lceil', 8968], ['rceil', 8969], ['lfloor', 8970], ['rfloor', 8971],
    ['lang', 9001], ['rang', 9002],
    ['loz', 9674], ['spades', 9824], ['clubs', 9827], ['hearts', 9829],
    ['diams', 9830],
    ['OElig', 338], ['oelig', 339], ['Scaron', 352], ['scaron', 353],
    ['Yuml', 376], ['circ', 710], ['tilde', 732],
    ['ensp', 8194], ['emsp', 8195], ['thinsp', 8201],
    ['zwnj', 8204], ['zwj', 8205], ['lrm', 8206], ['rlm', 8207],
    ['ndash', 8211], ['mdash', 8212],
    ['lsquo', 8216], ['rsquo', 8217], ['sbquo', 8218],
    ['ldquo', 8220], ['rdquo', 8221], ['bdquo', 8222],
    ['dagger', 8224], ['Dagger', 8225], ['permil', 8240],
    ['lsaquo', 8249], ['rsaquo', 8250], ['euro', 8364],
  ]
)

## Create a parser.
##
## encoding:: optional encoding name; it is stored lower-cased and passed
##            to InputStream#setEncoding when #parse reads from a stream
##
## Starts in HTML (non-XHTML) mode with forceHTML and eliminateWhiteSpace
## off, and registers every entity in DEFAULT_ENTITIES with the character
## itself as its replacement text.
def initialize(encoding = nil)
  @content = ''
  @pos = -1
  @entity = {}
  @encoding = encoding ? encoding.downcase : nil
  @forceHTML = false
  @xhtmlp = false
  @eliminateWhiteSpace = false
  DEFAULT_ENTITIES.each do |name, code|
    registerEntity(name, [code].pack('U'))
  end
end
## Read the next token of a declaration or start tag from @content,
## starting at @pos, and advance @pos past it.
##
## A token is one of: a whole comment ("<!-- ... -->"), a "--...--" comment
## inside a declaration, a whole processing instruction ("<?...?>"),
## "=" or ">", a quoted literal including its quotes, or a run of other
## characters ended by white space or a delimiter. A leading "<" or "["
## (and "/" in XHTML mode) starts a token and stays attached to the
## characters that follow it, e.g. "<!DOCTYPE".
##
## Returns nil at the end of the input.
## Raises ParseError when a comment, PI or literal is not terminated.
def nextToken
  token = ''
  elementpat = @xhtmlp ? /[\<\>\[\]\=\/]/u : /[\<\>\[\]\=]/u
  ## white space is optional between "--" and ">", as in #parse
  comment_end = /--[ \t\n\r]*>/u
  while !(c = @content[@pos, 1]).nil?
    if c == ''
      ## EOF
      return token if token != ''
      return nil
    elsif c == '-' && token == '<!-'
      ## Comment
      commentpos = @content.index(comment_end, @pos + 1)
      raise ParseError.new("comment parse error") unless commentpos
      @content[commentpos..-1] =~ comment_end
      len = $&.length
      token += @content[@pos, commentpos - @pos + len]
      @pos = commentpos + len
      return token
    elsif c == '-' && token == '-'
      ## Comment in decl
      commentpos = @content.index(/--/u, @pos + 1)
      raise ParseError.new("comment parse error") unless commentpos
      token += @content[@pos, commentpos - @pos + 2]
      @pos = commentpos + 2
      return token
    elsif c == '?' && token == '<'
      ## PI
      pipos = @content.index("?>", @pos + 1)
      raise ParseError.new("PI parse error") unless pipos
      token += @content[@pos, pipos - @pos + 2]
      @pos = pipos + 2
      return token
    elsif c =~ /[ \t\n\r]/u
      ## White Space: ends the current token, otherwise skipped
      return token if token != ''
      @pos += 1
      next
    elsif c =~ elementpat
      ## Delimiter: ends the current token first
      return token if token != ''
      if c == '=' || c == '>'
        @pos += 1
        return c
      end
      @pos += 1
      token = c
      next
    elsif token == '' && (c == '"' || c == "'")
      ## Literal
      quotpos = @content.index(c, @pos + 1)
      raise ParseError.new("literal parse error") unless quotpos
      token = @content[@pos, quotpos - @pos + 1]
      @pos = quotpos + 1
      return token
    else
      ## Others
      token += c
      @pos += 1
      next
    end
  end
  nil
end
## Is +str+ an acceptable name? It must start with a letter, '_' or ':'
## and continue with word characters, '.', '-' or ':'.
## Returns 0 when it is, nil otherwise.
def checkNameChar(str)
  name_pattern = /\A([^\W0-9]|:)[\w\.\-:]*\Z/u
  str =~ name_pattern
end
## Parse a document type declaration.
##
## dtd:: the markup text just read; @pos is moved back by its length so
##       that the declaration can be read again token by token
##
## Switches the parser to XHTML mode when the public identifier starts
## with "-//W3C//DTD XHTML " (unless forceHTML is set).
## Returns the declaration text without its first '<' and last '>'.
## Raises ParseError when the declaration is malformed.
def parseDTD(dtd)
  @pos -= dtd.length
  start = @pos
  token = nextToken
  unless token == '<!DOCTYPE'
    raise ParseError.new("DOCTYPE parse error: #{token.inspect}")
  end
  doctype = nextToken
  pubid = nil
  extid = nil
  token = nextToken
  case token
  when 'SYSTEM'
    extid = nextToken
    token = nextToken
  when 'PUBLIC'
    pubid = nextToken
    token = nextToken
    unless token == '>' || token == '['
      extid = token
      token = nextToken
    end
  end
  ## skip internal DTD subset
  if token == '['
    loop do
      token = nextToken
      break unless token
      next unless token == ']'
      token = nextToken
      break
    end
  end
  raise ParseError.new("DOCTYPE parse error") unless token == '>'
  @xhtmlp = true if !@forceHTML && pubid =~ /^[\"\']-\/\/W3C\/\/DTD XHTML /
  # p [doctype, pubid, extid]
  ## chop the first '<' and the last '>'
  @content[start + 1, @pos - start - 2]
end
## Is +name+ an element that never has content or an end tag?
## Always false in XHTML mode; otherwise the match position or nil.
def isEmptyElement(name)
  if @xhtmlp
    false
  else
    name =~ /^(br|area|link|img|param|hr|input|col|base|meta|basefont|frame|isindex)$/
  end
end
## Is +name+ an element whose content is read as raw character data
## (style or script)?
## Always false in XHTML mode; otherwise the match position or nil.
def isCdataElement(name)
  if @xhtmlp
    false
  else
    name =~ /^(style|script)$/
  end
end
## Parse an element start tag.
##
## elem:: the tag text without its leading '<'; @pos is moved back by its
##        length so that the tag can be read again token by token
##
## Returns [name, attrs, empty, rawattrs]: attrs maps attribute names to
## expanded values, rawattrs maps them to the values as written (quotes
## included), and empty is 1 for "<x/>" or an always-empty HTML element.
## Returns nil, with @pos restored to the start of the tag text, when the
## text is not a well-formed start tag.
## Raises ParseError for a malformed attribute.
def parseElementStartTag(elem)
  ## rewind
  @pos -= elem.length
  start = @pos
  name = nextToken
  unless checkNameChar(name)
    ## rollback
    @pos = start
    return nil
  end
  name.downcase! unless @xhtmlp

  empty = nil
  attrs = {}
  rawattrs = {}
  token = nextToken
  until token.nil? || token == '>'
    if token == '/' ## empty element tag
      if nextToken != '>'
        ## rollback
        @pos = start
        return nil
      end
      empty = 1
      break
    end

    attrname = token
    attrname.downcase! unless @xhtmlp
    token = nextToken
    if token == '='
      attrvalue = nextToken
      token = nextToken
    else
      ## bare word: it is the value, and normally also the name
      raise ParseError.new("attribute parse error") if @xhtmlp
      attrvalue = attrname
      (ATTR_NAME[name] || []).each do |realname, pattern|
        if attrvalue =~ pattern
          attrname = realname
          break
        end
      end
    end

    ## strip one pair of matching quotes, if any
    if attrvalue !~ /\A([\'\"]?)([\w\W]*)\1\Z/
      raise ParseError.new("attribute parse error: #{attrvalue.inspect}")
    end
    value = expandAttrValue(Regexp.last_match(2))
    if @eliminateWhiteSpace
      value.gsub!(/[ \x9\n]+/, ' ')
      value.gsub!(/\A +| +\z/, '')
    end
    attrs[attrname] = value
    rawattrs[attrname] = attrvalue
  end

  empty = 1 if isEmptyElement(name)
  [name, attrs, empty, rawattrs]
end
## Read @content from @pos up to the next occurrence of +key+.
##
## key::     String or Regexp to search for
## include:: how many characters of the match to consume as well; for a
##           Regexp any value above 0 means the whole match
##
## Returns the text read and moves @pos behind it. When +key+ does not
## occur, the rest of the content is returned and @pos becomes -1.
def expect(key, include = 0)
  found = @content.index(key, @pos)
  if found.nil?
    rest = @content[@pos..-1]
    @pos = -1
    return rest
  end
  if key.is_a?(Regexp) && include > 0
    @content[found..-1] =~ key
    include = $&.length
  end
  token = @content[@pos, found - @pos + include]
  @pos = found + include
  token
end
## Read the next piece of the document at @pos.
##
## e:: when given, a terminator (e.g. "</script"); everything before its
##     next case-insensitive occurrence is returned as raw character data
##
## Returns [:CDATA, text], [:MARKUP, text up to and including '>'] or
## [:PCDATA, text before the next '<']. The text of :MARKUP is nil when
## the input ends before a '>'.
def parseTag(e = nil)
  c = @content[@pos, 1]
  unless e.nil?
    return [:CDATA, expect(Parser.regnew(e, 'i'), 0)]
  end
  if c == '<'
    ## Markup
    token = expect(">", 1)
    return [:MARKUP, token[-1, 1] == '>' ? token : nil]
  end
  ## CharData
  [:PCDATA, expect("<")]
end
## Convert every CR LF pair and every lone CR in +str+ to LF.
##
## Returns a new string in the encoding of +str+ (nil for nil); the
## argument itself is left untouched. The replacement works on bytes, so
## it also succeeds on strings whose encoding is unknown or invalid.
def normalizeLineBreak(str)
  return nil unless str
  if str.respond_to?(:encoding)
    org_enc = str.encoding
    bytes = str.dup.force_encoding(::Encoding::ASCII_8BIT)
    bytes.gsub(/\x0d\x0a|\x0d/, "\x0a").force_encoding(org_enc)
  else
    str.gsub(/\x0d\x0a|\x0d/u, "\x0a")
  end
end
## May the element +child+ appear directly inside +parent+?
## True whenever either element has no entry in ContentList.
def checkContent(parent, child)
  known = ContentList.include?(child) && ContentList.include?(parent)
  return true unless known
  (child =~ ContentList[parent]) ? true : false
end
## Normalize the line breaks of the part of @content that has not been
## read yet (from @pos to the end), in place.
def doPreParseProcessing
  unread = @content[@pos..-1]
  @content[@pos..-1] = normalizeLineBreak(unread)
end
public | |
## Parser options; both are set to false by #initialize
attr_accessor :eliminateWhiteSpace, :forceHTML

## Character-class ranges U+3000-U+9FFF and U+F900-U+FAFF, as code points
ZenkakuChar = [0x3000, MINUS, 0x9fff, 0xf900, MINUS, 0xfaff]

zenkaku_class = "[#{ZenkakuChar.pack('U*')}]"
## Line breaks between two characters of those ranges; #parse removes them
## when eliminateWhiteSpace is set
IgnorableSpaces = regnew("(#{zenkaku_class})\n+(#{zenkaku_class})")
## Parse a document and report what is found.
##
## content:: a String, an InputStream, or any object responding to #read
##           (which is wrapped in an InputStream). On Ruby 1.9+ a String
##           is transcoded to UTF-8 first.
## block::   optional; called as block.call(type, name, data) with type
##           :XML_DECL, :PI, :COMMENT, :START_ELEM, :END_ELEM or :CDATA.
##           Without a block the handler methods xmlDecl,
##           processingInstruction, comment, startElement, endElement and
##           character are called instead.
##
## In HTML mode omitted start and end tags are inserted; in XHTML mode
## (entered by an XML declaration or an XHTML DOCTYPE unless forceHTML is
## set) the same situations raise ParseError.
##
## Returns 0 when +content+ is nil.
## Raises EncodingError for a String that is not validly encoded and
## ParseError for input that cannot be parsed.
def parse(content, &block)
  @content = content
  if !content.is_a?(InputStream) && content.respond_to?(:read)
    ## IO stream
    @content = InputStream.new(content)
  elsif content.is_a?(String) && content.respond_to?(:encoding)
    ## Ruby1.9 String
    if @content.encoding != ::Encoding::UTF_8
      @content = @content.encode(::Encoding::UTF_8)
    end
    if !@content.valid_encoding?
      raise EncodingError.new("invalid encoding")
    end
  end
  @block = block
  @pos = 0
  estack = []   ## names of the currently open elements, innermost last
  if @content.nil?
    return 0
  end
  if @encoding && @content.is_a?(InputStream)
    @content.setEncoding(@encoding)
  end
  lastContent = nil   ## kind of the previous piece, nil at the very start
  nextContent = nil   ## end tag to search for inside style/script
  while @pos >= 0
    ttype, part = parseTag(nextContent)
    oldpart = part
    nextContent = nil
    if part.nil?
      raise ParseError.new("unexpected EOF")
    elsif ttype == :PCDATA
      ## #PCDATA
      if !lastContent
        doPreParseProcessing
        part = normalizeLineBreak(part)
      end
      ## unknown encoding non-ascii characters
      if part.respond_to?(:encoding) &&
          !part.ascii_only? && part.encoding == ::Encoding::ASCII_8BIT
        raise ParseError.new("character encoding has not been specified")
      end
      lastContent = :PCDATA
      if estack.length == 0
        next if part =~ /\A[ \x9\r\n]*\Z/u
        raise ParseError.new("cdata must be in document element: #{part.inspect}")
      end
      if !havePCDATA?(estack[-1])
        next if part =~ /\A[ \x9\r\n]*\Z/u
        # raise ParseError.new("cannot have #PCDATA in #{estack[-1]}")
      end
      part = expandRef(part)
      if @eliminateWhiteSpace && estack[-1] != 'pre'
        part.gsub!(IgnorableSpaces, '\1\2')
        part.gsub!(/[ \x9\n]+/, ' ')
        part.gsub!(/\A +| +\z/, '')
      end
      dispatchEvent(:CDATA, nil, part) if part != ''
      next
    elsif ttype == :CDATA
      lastContent = :CDATA
      ## CDATA
      dispatchEvent(:CDATA, nil, part)
      next
    else
      first = part[1]
      if first == ?? && part =~ /\A<\?xml[ \t\n\r\?]/u && lastContent.nil?
        ## XML Declaration
        if (part =~ /\A<\?xml([ \t\n\r]+version[ \t\n\r]*=[ \t\n\r]*(['"])([a-zA-Z0-9_.:\-]+)\2)?([ \t\n\r]+encoding[ \t\n\r]*=[ \t\n\r]*(['"])(.*?)\5)?([ \t\n\r]+standalone[ \t\n\r]*=[ \t\n\r]*(['"])(yes|no)\8)?[ \t\n\r]*\?>/u) != 0
          raise ParseError.new("illegal XML declaration")
        end
        @xhtmlp = true if !@forceHTML
        version = $3
        encoding = $6
        standalone = $9
        if !version
          raise ParseError.new("invalid XML declaration")
        end
        if version != '1.0' && version != '1.1'
          raise ParseError.new("version #{version} not supported")
        end
        dispatchEvent(:XML_DECL, nil, [version, encoding, standalone])
        if encoding && @content.is_a?(InputStream)
          @content.setEncoding(encoding.downcase)
        end
        next
      end
      ## pre-parse processing after XML Declaration
      if !lastContent ||
          lastContent == :XML_DECL
        doPreParseProcessing
        part = normalizeLineBreak(part)
        next if lastContent
      end
      if first == ??
        ## Processing Instruction
        lastContent = :PI
        if part !~ /\?>\Z/u
          part += expect("?>", 2)
          if part !~ /\?>\Z/u
            raise ParseError.new("processing instruction data expected")
          end
        end
        part = part[2..-3] ## strip "<?" and "?>"
        part =~ /\A([^ \t\n\r]+)([ \t\n\r]+(.*))?\Z/mu
        name = $1
        data = $3.to_s
        if @xhtmlp && name =~ /\Axml\z/i
          raise ParseError.new("illegal PI name: #{name.inspect}")
        end
        ##!!! check name
        dispatchEvent(:PI, name, data)
        next
      elsif first == ?!
        if part =~ /\A<!--/u
          ## Comment
          lastContent = :COMMENT
          if part !~ /--([ \t\n\r]*>)\Z/u
            ## the '>' that ended the markup was inside the comment
            part += expect(/--[ \t\n\r]*>/u, 3)
          end
          if @xhtmlp && part !~ /-->\Z/u
            raise ParseError.new("comment must end with \"-->\"")
          end
          part =~ /\A<!--([\s\S]*)--[ \t\n\r]*>\Z/u
          part = $1
          if @xhtmlp && part =~ /--/u
            raise ParseError.new("comment must not contain '--'")
          end
          dispatchEvent(:COMMENT, nil, part)
          next
        elsif part =~ /\A<!DOCTYPE/u
          ## Document type declaration
          lastContent = :DTD
          part = parseDTD(oldpart)
          ## dtdHandler(part) if !part.nil?
          next
        elsif @xhtmlp && part =~ /\A<!\[CDATA\[/u
          if estack.length == 0
            raise ParseError.new("cdata must be in document element")
          end
          ## CDATA Section
          lastContent = :CDATA
          if part !~ /\]\]>\Z/u
            part += expect("]]>", 3)
            if part !~ /\]\]>\Z/u
              raise ParseError.new("\"<![CDATA[\" must end with \"]]>\"")
            end
          end
          part = part[9..-4]
          dispatchEvent(:CDATA, nil, part)
          next
        else
          ## unknown markup is skipped silently
          # raise ParseError.new("unknown markup: #{part.inspect}")
        end
      else
        ## Element
        lastContent = :ELEMENT
        name = nil
        attrs = nil
        rawattrs = nil
        empty = nil
        endTagP = nil
        if part =~ /\A<\//
          ## element end tag
          name = part
          name.downcase! unless @xhtmlp
          name.sub!(/\A<\/([^ \t\n\r]+)[ \t\n\r]*>\Z/u, '\1')
          endTagP = 1
          ## end tag in document root
          if estack.length == 0
            if @xhtmlp
              raise ParseError.new("not opened end tag: #{name.inspect}")
            end
            ## !!! INVALID !!!
            estack.push(name)
            dispatchEvent(:START_ELEM, name, {})
          end
          ## unmatch end tag
          if name != (e = estack.pop)
            if @xhtmlp
              raise ParseError.new("not opened end tag: #{name.inspect}")
            end
            if !estack.include?(name)
              ## never opened: report an empty element and keep +e+ open
              estack.push(e)
              dispatchEvent(:START_ELEM, name, {})
            else
              ## insert omitted end tags
              while true
                break if e == name
                dispatchEvent(:END_ELEM, e, nil)
                e = estack.pop
              end
            end
          end
        elsif part =~ /\A<[a-zA-Z]+/
          ## element start tag
          name, attrs, empty, rawattrs = parseElementStartTag(oldpart[1..-1])
          if name.nil?
            ## not a tag after all: report the '<' as text
            dispatchEvent(:CDATA, nil, '<')
            next
          end
          if @xhtmlp
            # if !checkContent(estack[-1], name)
            #   raise ParseError.new("illegal element #{name.inspect} in #{estack[-1].inspect}")
            # end
          else
            if (tags = guessOmittedTag(estack[-1], name))
              ## insert omitted start tags
              tags.each do |n|
                estack.push(n)
                dispatchEvent(:START_ELEM, n, {})
              end
            else
              ## insert omitted end tags
              while !checkContent(estack[-1], name) &&
                  estack[-1] !~ /^(html|body)$/
                dispatchEvent(:END_ELEM, estack[-1], nil)
                estack.pop
              end
            end
            ## insert omitted start tags
            if (tags = guessOmittedTag(estack[-1], name))
              tags.each do |n|
                estack.push(n)
                dispatchEvent(:START_ELEM, n, {})
              end
            end
          end
          estack.push(name) if !empty
          ## change encoding
          if @content.is_a?(InputStream) && name == 'meta' &&
              ((attrs['http-equiv'] =~ /^content-type$/iu &&
                attrs['content'] =~ /\bcharset[ \t\n\r]*=[ \t\n\r]*([a-zA-Z0-9\-_]+)\b/) ||
               attrs['charset'] =~ /^([a-zA-Z0-9\-_]+)$/) ## HTML5
            @content.setEncoding($1.downcase)
          end
          dispatchEvent(:START_ELEM, name, attrs)
        else
          ## illegal markup
          part = expandRef(part)
          dispatchEvent(:CDATA, nil, part)
          next
        end
        if empty || endTagP
          dispatchEvent(:END_ELEM, name, nil)
          next
        elsif isCdataElement(name)
          ## style and script element
          nextContent = "</#{name}"
          next
        end
        next
      end
      # p [ttype, part]
    end
  end
  if estack.length > 0
    if @xhtmlp
      raise ParseError.new("unclosed element: #{estack.pop.inspect}")
    end
    ## close whatever is still open, innermost first
    estack.reverse_each do |name|
      dispatchEvent(:END_ELEM, name, nil)
    end
  end
end

## Report one parse event: to the block given to #parse when there is one,
## otherwise to the matching handler method.
def dispatchEvent(type, name, data)
  return @block.call(type, name, data) if @block
  case type
  when :CDATA then character(data)
  when :XML_DECL then xmlDecl(*data)
  when :PI then processingInstruction(name, data)
  when :COMMENT then comment(data)
  when :START_ELEM then startElement(name, data)
  when :END_ELEM then endElement(name)
  end
end
private :dispatchEvent
## Request that parsing stop by setting the current position to -1.
## NOTE(review): presumably the parse loop treats a negative position as
## "stop" -- confirm against the part of parse that reads @pos.
def stop; @pos = -1; end

## Return the current position (@pos) within the content.
def getPos; @pos; end

## Return the number of newline characters found in the first @pos
## units of the content (so 0 while still on the first line).
def getLine
  consumed = @content[0, @pos]
  consumed.count("\n")
end
## | |
## Default handler | |
## | |
protected | |
## No-op default handlers.  parse calls startElement, endElement and
## character when it is run without a block; subclasses (or singleton
## methods on a parser instance) override the ones they care about.
## NOTE(review): xmlDecl, processingInstruction and comment are assumed
## to be invoked the same way from the earlier part of parse -- confirm.

## Character data (text).
def character(text); end

## XML declaration: version, encoding and standalone values.
def xmlDecl(version, encoding, standalone); end

## Processing instruction: target name and its data.
def processingInstruction(name, data); end

## Comment body.
def comment(data); end

## Start tag: element name and its attributes.
def startElement(name, attrs); end

## End tag: element name.
def endElement(name); end
end | |
class InputStream | |
attr_reader :uri | |
## Code points for the Windows-1252 bytes 0x80..0x9f, indexed by
## (byte - 0x80).  Bytes that Windows-1252 leaves unassigned map to
## U+FFFD.  Frozen so the shared table cannot be modified by accident.
CP1252_TO_UCS =
  [0x20ac, 0xfffd, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
   0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0xfffd, 0x017d, 0xfffd,
   0xfffd, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
   0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0xfffd, 0x017e, 0x0178].freeze
## Upper bound, in bytes, passed to stream.read by initialize.
MAX_LENGTH = 0x1fffffff
class ::Integer | |
def ord | |
self | |
end | |
end | |
## Merge UTF-16 surrogate pairs in an array of 16-bit code units.
##
## ary - Array of Integers (UTF-16 code units).
##
## Returns a new Array of code points: every high surrogate
## (0xd800..0xdbff) immediately followed by a low surrogate
## (0xdc00..0xdfff) becomes one supplementary code point; every other
## unit, including an unpaired surrogate, is copied through unchanged.
def combineSurrogatePair(ary)
  out = []
  pending = nil ## high surrogate still waiting for its partner
  ary.each do |unit|
    if pending
      if unit >= 0xdc00 && unit <= 0xdfff
        combined = ((pending & 0x3ff) << 10) | (unit & 0x3ff)
        out << (combined + 0x10000)
        pending = nil
        next
      end
      ## not followed by a low surrogate: emit the lone unit as is
      out << pending
      pending = nil
    end
    if unit >= 0xd800 && unit <= 0xdbff
      pending = unit
    else
      out << unit
    end
  end
  out << pending if pending
  out
end
private :combineSurrogatePair
def initialize(stream, encoding = nil, &block) | |
@encoding = encoding ? encoding.downcase : nil | |
@block = block | |
@autodetectedEncoding = nil | |
@uri = nil | |
if stream.is_a?(String) | |
@content = stream | |
else | |
@content = stream.read(MAX_LENGTH) | |
end | |
taintp = @content.tainted? | |
@content.force_encoding(::Encoding::ASCII_8BIT) if @content.respond_to?(:force_encoding) | |
## auto encoding detection | |
if @encoding.nil? | |
if @content.length >= 4 | |
if @content[0].ord == 0xff && @content[1].ord == 0xfe && | |
@content[2].ord != 0 | |
## UTF-16 (LE) | |
@content = combineSurrogatePair(@content[2..-1].unpack('v*')).pack('U*') | |
@autodetectedEncoding = 'utf-16' | |
elsif @content[0].ord == 0xfe && @content[1].ord == 0xff && | |
@content[3].ord != 0 | |
## UTF-16 (BE) | |
@content = combineSurrogatePair(@content[2..-1].unpack('n*')).pack('U*') | |
@autodetectedEncoding = 'utf-16' | |
elsif @content[0..3] == "<\0?\0" | |
## UTF-16LE | |
@content = combineSurrogatePair(@content.unpack('v*')).pack('U*') | |
@autodetectedEncoding = 'utf-16le' | |
elsif @content[0..3] == "\0<\0?" | |
## UTF-16BE | |
@content = combineSurrogatePair(@content.unpack('n*')).pack('U*') | |
@autodetectedEncoding = 'utf-16be' | |
elsif @content[0].ord == 0xff && @content[1].ord == 0xfe && | |
@content[2].ord == 0 && @content[3].ord == 0 | |
## UTF-32 (LE) | |
@content = @content[4..-1].unpack('V*').pack('U*') | |
@autodetectedEncoding = 'utf-32' | |
elsif @content[0].ord == 0 && @content[1].ord == 0 && | |
@content[2].ord == 0xfe && @content[3].ord == 0xff | |
## UTF-32 (BE) | |
@content = @content[4..-1].unpack('N*').pack('U*') | |
@autodetectedEncoding = 'utf-32' | |
elsif @content[0..7] == "<\0\0\0?\0\0\0" | |
## UTF-32LE | |
@content = @content.unpack('V*').pack('U*') | |
@autodetectedEncoding = 'utf-32le' | |
elsif @content[0..7] == "\0\0\0<\0\0\0?" | |
## UTF-32BE | |
@content = @content.unpack('N*').pack('U*') | |
@autodetectedEncoding = 'utf-32be' | |
elsif @content[0].ord == 0xef && @content[1].ord == 0xbb && | |
@content[2].ord == 0xbf | |
## UTF-8 (BOM) | |
@content = @content[3..-1] | |
@autodetectedEncoding = 'utf-8' | |
elsif @content[0..3] == "\x4c\x6f\xa7\x94" || | |
@content[0..3] == "\x4c\x6f\xb7\x75" || | |
@content[0..3] == "\x4c\x6f\xab\x73" | |
raise EncodingError.new("EBCDIC not supported") | |
end | |
end | |
elsif @encoding == 'us-ascii' | |
## no conversion | |
elsif @encoding == 'utf-8' | |
## delete BOM | |
if @content[0].ord == 0xef && @content[1].ord == 0xbb && | |
@content[2].ord == 0xbf | |
## UTF-8 (BOM) | |
@content = @content[3..-1] | |
end | |
elsif @encoding == 'utf-16' | |
if @content.length >= 4 | |
if @content[0].ord == 0xff && @content[1].ord == 0xfe && | |
@content[2].ord != 0 | |
## UTF-16 (LE) | |
@content = combineSurrogatePair(@content[2..-1].unpack('v*')).pack('U*') | |
elsif @content[0].ord == 0xfe && @content[1].ord == 0xff && | |
@content[3].ord != 0 | |
## UTF-16 (BE) | |
@content = combineSurrogatePair(@content[2..-1].unpack('n*')).pack('U*') | |
else | |
raise EncodingError.new("illegal UTF-16 sequence") | |
end | |
end | |
elsif @encoding == 'utf-16le' | |
@content = combineSurrogatePair(@content.unpack('v*')).pack('U*') | |
elsif @encoding == 'utf-16be' | |
@content = combineSurrogatePair(@content.unpack('n*')).pack('U*') | |
elsif @encoding == 'utf-32' | |
if @content[0].ord == 0xff && @content[1].ord == 0xfe && | |
@content[2].ord == 0 && @content[3].ord == 0 | |
## UTF-32 (LE) | |
@content = @content[4..-1].unpack('V*').pack('U*') | |
elsif @content[0].ord == 0 && @content[1].ord == 0 && | |
@content[2].ord == 0xfe && @content[3].ord == 0xff | |
## UTF-32 (BE) | |
@content = @content[4..-1].unpack('N*').pack('U*') | |
else | |
raise EncodingError.new("illegal UTF-32 sequence") | |
end | |
elsif @encoding == 'utf-32le' | |
@content = @content.unpack('V*').pack('U*') | |
elsif @encoding == 'utf-32be' | |
@content = @content.unpack('N*').pack('U*') | |
else | |
@content = unknownEncoding(@encoding, @content) | |
end | |
@content.force_encoding(::Encoding::UTF_8) if @content.respond_to?(:force_encoding) | |
@content.taint if taintp | |
end | |
## Fix the document encoding after construction (parse calls this when it
## meets a <meta> charset declaration).
##
## encoding - encoding name; compared verbatim, so callers pass it
##            downcased.
##
## Does nothing when an encoding is already fixed.  Otherwise records the
## encoding and, unless it is utf-8, us-ascii or the auto-detected one,
## converts the content to UTF-8 through unknownEncoding.
##
## Raises EncodingError when the name contradicts the auto-detected
## encoding, or when the resulting content is not validly encoded.
def setEncoding(encoding)
  return if @encoding ## already set
  if @autodetectedEncoding && @autodetectedEncoding != encoding
    raise EncodingError.new("encoding does not match auto detected encoding (#{@autodetectedEncoding}): #{encoding.inspect}")
  end
  @encoding = encoding
  unless ['utf-8', 'us-ascii', @autodetectedEncoding].include?(encoding)
    ## hand the converter raw bytes, then tag its result as UTF-8
    taggable = @content.respond_to?(:force_encoding)
    @content.force_encoding(::Encoding::ASCII_8BIT) if taggable
    @content = unknownEncoding(encoding, @content)
    @content.force_encoding(::Encoding::UTF_8) if taggable
  end
  if @content.respond_to?(:valid_encoding?) && !@content.valid_encoding?
    raise EncodingError.new("invalid encoding")
  end
end
## Return everything up to and including the last "/" of uri ("" when
## uri contains no "/"), or nil when uri is nil.
def self._getURIBase(uri = nil)
  (uri =~ /^(.*?\/?)[^\/]*$/) ? $1 : nil
end

## Return the "scheme://host/" prefix of an http, https, ftp or file
## URI (without the trailing "/" when the URI has none), or nil when uri
## does not start with one of those schemes.
def self._getURIHost(uri)
  (uri =~ /^((https?|ftp|file):\/\/[^\/]*\/?).*$/) ? $1 : nil
end

## Resolve uri against baseuri.
##
## baseuri - URI of the referring document (nil is treated as "").
## uri     - reference found in the document.
##
## Returns
## * uri itself when it starts with "letters:" (already absolute);
## * the base scheme + ":" + uri for a network-path reference
##   ("//host/path"), or uri unchanged when the base has no scheme;
## * the base "scheme://host" + uri for an absolute path ("/path");
## * the base directory + uri otherwise.
def self._catURI(baseuri, uri)
  baseuri = baseuri.to_s
  if uri =~ /^([a-zA-Z]+):/
    uri
  elsif uri =~ /^\/\//
    ## "//host/path" names another host and only inherits the scheme;
    ## appending it to the base host would yield "http://a//host/path"
    scheme = baseuri[/^([a-zA-Z]+):/, 1]
    scheme ? "#{scheme}:#{uri}" : uri
  elsif uri =~ /^\//
    ## drop the host's trailing "/" so the path's leading "/" is kept once
    _getURIHost(baseuri).to_s.chomp('/') + uri
  else
    _getURIBase(baseuri) + uri
  end
end
## Record the URI (or file name) this stream was loaded from.
def setURI(uri); @uri = uri; end

## Return the URI recorded by setURI (nil until one is set).
def getURI; @uri; end

## Return the directory part of the recorded URI, as computed by the
## class-level _getURIBase.
def getURIBase
  resolver = self.class
  resolver._getURIBase(@uri)
end
## Build an InputStream from a local file.
##
## file     - path of the file to read.
## encoding - optional encoding name, passed to new.
## block    - optional fallback converter, passed to new.
##
## The file is opened in binary mode with File.open (never Kernel#open,
## which would run a command for a name starting with "|") and is closed
## as soon as the stream has been constructed.
##
## Returns the new stream, with its URI set to file.
def self.openFile(file, encoding = nil, &block)
  ret = File.open(file, 'rb') { |io| self.new(io, encoding, &block) }
  ret.setURI(file)
  ret
end
## Default URI resolver: open-uri when it can be loaded, otherwise the
## curl command line.  A Proc resolver receives the URI and returns an
## object that new accepts; a String resolver is run as a shell command
## by openURI.
begin
  require 'open-uri'
  ## Kernel#open no longer opens URLs since Ruby 3.0; URI.open is the
  ## replacement and falls back to Kernel#open for non-URL names.
  @@FETCH_CMD = proc {|uri| URI.respond_to?(:open) ? URI.open(uri) : open(uri) }
rescue LoadError
  @@FETCH_CMD = '/usr/bin/curl -s'
end
## Replace the resolver used by openURI.
##
## cmd - a Proc taking the URI and returning the object to read from, or
##       a String holding a shell command to which the URI is appended.
def self.setURIResolver(cmd); @@FETCH_CMD = cmd; end
## Build an InputStream from a URI, resolved against base.
##
## uri      - absolute or relative URI (or local path).
## base     - optional base URI used to resolve uri.
## encoding - optional encoding name, passed to new.
## block    - optional fallback converter, passed to new.
##
## With a Proc resolver the Proc is called with the resolved URI.  With a
## command-line resolver, http/https/ftp/file URIs are fetched by running
## the command with the shell-escaped URI appended; anything else is read
## as a local file.  Whatever was opened is closed once the stream has
## been constructed.
##
## Returns the new stream, with its URI set to the resolved URI.
def self.openURI(uri, base = nil, encoding = nil, &block)
  uri = _catURI(base, uri)
  if @@FETCH_CMD.is_a?(Proc)
    source = @@FETCH_CMD.call(uri)
  elsif uri =~ /^(https?|ftp|file):/
    ## the URI usually comes from document markup: escape it so that
    ## quotes or shell metacharacters cannot alter the command
    require 'shellwords'
    source = IO.popen("#{@@FETCH_CMD} #{Shellwords.escape(uri)}")
  else
    ## File.open, not Kernel#open: a name starting with "|" must not be
    ## executed as a command
    source = File.open(uri, 'rb')
  end
  begin
    ret = self.new(source, encoding, &block)
  ensure
    if source.respond_to?(:close) &&
        !(source.respond_to?(:closed?) && source.closed?)
      source.close
    end
  end
  ret.setURI(uri)
  ret
end
## index, length, [] and []= treat @content as ASCII-8BIT (byte
## offsets) as long as @encoding is not fixed.

## Run the block with @content temporarily tagged as ASCII-8BIT when no
## encoding is fixed yet (and strings carry an encoding at all); the
## original tag is restored even if the block raises.  Returns the
## block's value.
def withBinaryContent
  return yield if @encoding || !@content.respond_to?(:encoding)
  org_enc = @content.encoding
  @content.force_encoding(::Encoding::ASCII_8BIT)
  begin
    yield
  ensure
    @content.force_encoding(org_enc) if org_enc
  end
end
private :withBinaryContent

## Same as String#index on the content; nil when there is no content.
def index(pat, start)
  return nil if @content.nil?
  withBinaryContent { @content.index(pat, start) }
end

## Length of the content (bytes while the encoding is not fixed);
## 0 when there is no content.
def length
  return 0 if @content.nil?
  withBinaryContent { @content.length }
end

## Same as String#[] with one or two arguments; nil when there is no
## content.  While the encoding is not fixed the result is tagged
## ASCII-8BIT.
def [](pos, len = nil)
  return nil if @content.nil?
  withBinaryContent { len.nil? ? @content[pos] : @content[pos, len] }
end

## stream[pos] = str          (value1 is the replacement)
## stream[pos, len] = str     (value1 is the length, value2 the replacement)
##
## While the encoding is not fixed the replacement is spliced in as raw
## bytes; a binary-tagged copy is used so the caller's string is left
## alone.  Returns the content, or nil when there is no content.
def []=(pos, value1, value2 = nil)
  return if @content.nil?
  byteMode = !(@encoding || !@content.respond_to?(:encoding))
  raw = lambda {|s| byteMode ? s.dup.force_encoding(::Encoding::ASCII_8BIT) : s }
  withBinaryContent do
    if value2.nil?
      @content[pos] = raw.call(value1)
    else
      ## value1 is an Integer length here and must not be re-tagged
      @content[pos, value1] = raw.call(value2)
    end
  end
  @content
end
## Convert content from the named encoding to UTF-8.
##
## encoding - encoding name (the case branches expect it downcased).
## content  - String to convert; its encoding tag may be changed.
##
## Converters are tried in this order: String#encode, iconv, then for
## euc-jp / shift_jis / iso-2022-jp the nkf and uconv libraries, for
## iso-8859-1 / windows-1252 a built-in byte mapping, and for any other
## name the block given to new, called as block.call(encoding, content).
##
## Returns the converted String.
## Raises EncodingError when no converter is available.
def unknownEncoding(encoding, content)
  ## try Ruby 1.9 String#encode
  begin
    if content.respond_to?(:force_encoding)
      content.force_encoding(encoding)
      return content.encode(::Encoding::UTF_8)
    end
  rescue ArgumentError, ::Encoding::ConverterNotFoundError
    ## unknown name, or an encoding Ruby knows but cannot transcode:
    ## give the raw bytes to the fallbacks below
    content.force_encoding(::Encoding::ASCII_8BIT)
  end
  ## try iconv
  begin
    require 'iconv'
    return Iconv.iconv('UTF-8', encoding, content).join('')
  rescue LoadError,Iconv::InvalidEncoding
  end
  ## other conversion libraries
  case encoding
  when 'euc-jp'
    begin
      require 'nkf'
      if NKF::UTF8
        return NKF.nkf('-Ewm0x', content)
      end
    rescue LoadError,NameError
    end
    begin
      require 'uconv'
      return Uconv.euctou8(content)
    rescue LoadError
    end
  when 'shift_jis'
    begin
      require 'nkf'
      if NKF::UTF8
        return NKF.nkf('-Swm0x', content)
      end
    rescue LoadError,NameError
    end
    begin
      require 'uconv'
      return Uconv.sjistou8(content)
    rescue LoadError
    end
  when 'iso-2022-jp'
    begin
      require 'nkf'
      if NKF::UTF8
        return NKF.nkf('-Jwm0x', content)
      end
    rescue LoadError,NameError
    end
    begin
      require 'uconv'
      require 'nkf'
      return Uconv.euctou8(NKF.nkf('-Jem0x', content))
    rescue LoadError
    end
  when 'iso-8859-1'
    ## every byte value is its own code point
    return content.gsub(/([\x80-\xff])/n) {|m| [m[0].ord].pack('U') }
  when 'windows-1252'
    ## 0x80..0x9f go through the table, the rest as in iso-8859-1
    return content.gsub(/([\x80-\xff])/n) {|m|
      m[0].ord < 0xa0 ?
        [CP1252_TO_UCS[m[0].ord - 0x80]].pack('U') :
        [m[0].ord].pack('U')
    }
  else
    if @block
      return @block.call(encoding, content)
    end
  end
  raise EncodingError.new("unknown encoding: #{encoding.inspect}")
end
private :unknownEncoding
end | |
end | |
## Sample | |
## | |
## ruby -rymhtml -e urls <HTMLfile> | |
## Sample: print [element, resolved URI] for every link/a href and img
## src in the document named by ARGV[0] (a URL or a file name), or read
## from $< when no argument is given.
def urls
  target = ARGV[0]
  stream =
    if ARGV.empty?
      YmHTML::InputStream.new($<)
    elsif target =~ /^(https?|ftp|file):/
      YmHTML::InputStream.openURI(target)
    else
      YmHTML::InputStream.openFile(target)
    end
  parser = YmHTML::Parser.new
  parser.eliminateWhiteSpace = true
  ## per-instance start-tag handler; the base URI is shared through $base
  def parser.startElement(n, d)
    attr = case n
           when 'link', 'a' then 'href'
           when 'img' then 'src'
           end
    return unless attr && d[attr]
    p([n, YmHTML::InputStream._catURI($base, d[attr])])
  end
  $base = stream.getURIBase
  parser.parse(stream)
end
## Command-line driver: parse the document named by ARGV[0] (a URL or a
## file name; $< when absent) and print every parse event as
## [type, name, data].
##   -e, --encoding ENCODING   force input character encoding
##   -h, --forceHTML           force HTML mode even if XHTML
if $0 == __FILE__
  $OPT_e = nil
  $OPT_h = nil
  begin
    require 'optparse'
    ARGV.options do |o|
      o.banner << ' <HTMLfile>'
      o.on('-e', '--encoding ENCODING',
           'force input character encoding') do |arg|
        $OPT_e = arg
      end
      o.on('-h', '--forceHTML',
           'force HTML mode even if XHTML') do |arg|
        $OPT_h = arg
      end
      o.parse!
    end
  rescue LoadError
    require 'parsearg'
    $USAGE = 'print "Usage: #{$0} [-h] [-e <encoding>] <HTMLfile>\n"'
    parseArgs(0, nil, 'h', 'e:')
  end
  ## YmHTML::InputStream.setURIResolver("wget -O - -o /dev/null")
  if ARGV.length == 0
    stream = YmHTML::InputStream.new($<, $OPT_e)
  elsif /^(https?|ftp|file):/ =~ ARGV[0]
    ## openURI takes (uri, base, encoding): the encoding is the third
    ## argument, the second one is the base URI
    stream = YmHTML::InputStream.openURI(ARGV[0], nil, $OPT_e)
  else
    stream = YmHTML::InputStream.openFile(ARGV[0], $OPT_e)
  end
  parser = YmHTML::Parser.new
  ## eliminate white spaces (without content of PRE element)
  parser.eliminateWhiteSpace = true
  parser.forceHTML = true if $OPT_h
  parser.parse(stream) do |t, n, d|
    if defined?(::Encoding)
      ## re-encode for the terminal, replacing unrepresentable characters
      n = n.encode(Encoding.default_external, :undef=>:replace) if n
      if d.kind_of?(Hash)
        d_ = {}
        d.each do |k, v|
          k = k.encode(Encoding.default_external, :undef=>:replace)
          v = v.encode(Encoding.default_external, :undef=>:replace) if v
          d_[k] = v
        end
        d = d_
      elsif d.kind_of?(String)
        d = d.encode(Encoding.default_external, :undef=>:replace)
      end
    end
    p([t, n, d])
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment