Created
November 23, 2011 15:59
-
-
Save hisui/1389048 to your computer and use it in GitHub Desktop.
rjquery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
require "strscan" | |
require "set" | |
# Pull-style HTML tokenizer built on StringScanner.
#
# Every successful call to #next_node consumes one token and records it:
#   node  - :text, :open, :empty (tag closed with "/>") or :close
#   text  - the raw text of a :text token
#   name  - the lower-cased tag name of a tag token
#   attrs - Hash of attribute name => value, rebuilt for each :open/:empty tag
#           (an attribute without a value maps to its own name)
class RjStAX
  attr_reader :scan, :node, :name, :attrs, :text

  # Leading text (which may contain "<!...>" constructs) followed by the start
  # of a tag. Captures: 1 = leading text, 2 = the whole tag start,
  # 3 = "/" for a closing tag, 4 = the (optionally prefixed) tag name.
  TAG_START = %r{^((?:.*?(?:<!.*?>)?)*?)(<\s*(/)?\s*((?:\w+:)?\w+))}m
  # One attribute. Captures: 1 = name, 2 = quote character (may be empty),
  # 3 = value (nil when the attribute has no "=value" part).
  ATTRIBUTE = %r{^\s*([^\s/=>]+)(?:\s*=\s*(['"]?)((?=['"])(?=\2)|(?!\2).*?|[^\s>]*)\2)?}m
  # End of an opening tag; capture 1 is "/" for a self-closed tag.
  OPEN_TAG_END = %r{^\s*(/)?\s*>}m
  # End of a closing tag.
  CLOSE_TAG_END = /^\s*>/m

  # html - the String to tokenize.
  def initialize(html)
    @scan = StringScanner.new html
  end

  # Advances to the next token.
  #
  # Returns true when a token was read (inspect #node, #name, #attrs, #text),
  # or nil once the input is exhausted.
  def next_node
    if @scan.scan(TAG_START)
      leading = @scan[1]
      unless leading.empty?
        @text = leading
        @node = :text
        # Un-read the tag start so the next call reports the tag itself.
        # StringScanner#pos is a byte offset, hence bytesize.
        @scan.pos -= @scan[2].bytesize
        return true
      end
      return true if read_tag
    end
    return nil if @scan.eos?

    # No (well-formed) tag ahead: everything that is left is one text token.
    @text = @scan.rest
    @node = :text
    # Jump straight to the end. Adding the character count of the text to the
    # byte-based position would stop short on multibyte text and re-emit
    # fragments of it on later calls.
    @scan.terminate
    true
  end

  private

  # Finishes the tag whose start was just matched by TAG_START.
  # Returns true when the tag was terminated properly, false otherwise.
  def read_tag
    @name = @scan[4].downcase
    if @scan[3]
      return false unless @scan.scan(CLOSE_TAG_END)
      @node = :close
      return true
    end

    @attrs = {}
    while @scan.scan(ATTRIBUTE)
      @attrs[@scan[1]] = @scan[3] || @scan[1]
    end
    return false unless @scan.scan(OPEN_TAG_END)
    @node = @scan[1] ? :empty : :open
    true
  end
end
# 要素の集合を表す | |
class RjNodeSet | |
include Enumerable | |
class ArraySet < RjNodeSet | |
def initialize(list) | |
@list = list | |
end | |
def each(&block) | |
@list.each &block | |
end | |
def to_a | |
@list.dup | |
end | |
end | |
def self.from_a(a) | |
a.size == 1 ? a[0]: ArraySet.new(a) | |
end | |
def method_missing(key, *args) | |
first or raise NoMethodError.new | |
first.__send__ key, args | |
end | |
def find(query) | |
filter " #{query}" | |
end | |
def filter(query) | |
scan = StringScanner.new query | |
def scan.*(pattern) | |
scan pattern | |
end | |
RjNodeSet.from_a eval_query(scan, to_a, /^$/) | |
end | |
# jQuery風のクエリを実行する | |
# 参考: http://semooh.jp/jquery/api/selectors/ | |
def eval_query(scan, list, terminator) | |
#p ["QUERY:", scan.rest, terminator] | |
until scan.scan terminator | |
# 検索範囲を限定 | |
case | |
when scan * /^\s+/; # descendings | |
done = Set.new list | |
list.map! {|node| node.children }.flatten! | |
list.each {|node| | |
next if done.member? node | |
done << node | |
list.concat node.children | |
} | |
when scan * /^\+/; list.map! {|node| node.succ or[]}.flatten! | |
when scan * /^\-/; list.map! {|node| node.prev or[]}.flatten! | |
when scan * /^\>/; list.map! {|node| node.children }.flatten! | |
when scan * /^\~/; list.map! {|node| node.siblings }.flatten! | |
list.uniq! # {|node| node.object_id } | |
end | |
# フィルタリング | |
loop { | |
case | |
when scan * /^\*/; nil | |
when scan * /^\.(\w+)/; list.select! {|node| node.attrs["class"] =~/\b#{scan[1]}\b/ } | |
when scan * /^\#(\w+)/; list.select! {|node| node.attrs["id"] == scan[1] } | |
when scan * /^(\w+)/; list.select! {|node| node.name == scan[1] } | |
when scan * /^:parent/; list.select! {|node| node.children.empty? } | |
when scan * /^:empty/; list.reject! {|node| node.children.empty? } | |
# リスト操作タイプ | |
when scan * /^:first/; list = [list[ 0]] unless list.empty? | |
when scan * /^:last/; list = [list[-1]] unless list.empty? | |
when scan * /^:eq\((.*?)\)/; list = list[scan[1].to_i, 1] || [] | |
when scan * /^:gt\((.*?)\)/; list.slice! scan[1].to_i..-1 | |
when scan * /^:lt\((.*?)\)/; list.slice! 0 ..scan[1].to_i | |
# 子要素フィルタ(要素インデックスと違って1-origin) | |
when scan * /^:nth-child\((\d+)(n(?:\+(\d+))?)?\)/ | |
i = scan[1].to_i | |
j = scan[3].to_i | |
list.select! &(scan[2] ? | |
lambda {|node| (node.index+1) % i == j }: | |
lambda {|node| node.index+1 == i }) | |
# 属性フィルタ | |
when scan * /^\[(\w+)(?:([!^$*]?)=(.*?))?\]/; | |
key = scan[1] | |
val = scan[3] | |
list.select! {|node| node.attrs[key] } | |
case scan[2] | |
when ""; list.select! {|node| node.attrs[key] == val } | |
when "!"; list.select! {|node| node.attrs[key] != val } | |
when "^"; list.select! {|node| node.attrs[key] =~/^#{val}/ } | |
when "$"; list.select! {|node| node.attrs[key] =~ /#{val}$/ } | |
when "*"; list.select! {|node| node.attrs[key].include? val } | |
end | |
# 与えられた文字列を持つ要素を(σ・∀・)σゲッツ!! | |
when scan * /^:contains\((.*?)\)/; | |
list.select! {|node| node.inner_text.include? scan[1] } | |
# 再帰するやつ(適当なのでなまら重い) | |
when scan * /^:not\(/; list -= eval_query(scan, list.dup, /^\)/) | |
when scan * /^:has\(/; | |
pos = scan.pos | |
list.reject! {|node| | |
scan.pos = pos | |
eval_query(scan, node.children.dup, /^\)/).empty? | |
} | |
eval_query(scan, [], /^\)/) if pos == scan.pos | |
else break | |
end | |
} | |
end | |
list | |
end | |
end | |
# Represents an HTML element (tag).
#
# An RjNode is also a node set containing just itself, so the selector
# methods inherited from RjNodeSet can be used directly on a node.
#
#   name   - tag name
#   attrs  - Hash of attribute name => value
#   all    - every child node in document order (elements and text nodes)
#   parent - the owning RjNode, or nil
class RjNode < RjNodeSet
  attr_accessor :name, :attrs, :all, :parent

  def initialize(name, attrs)
    @attrs = attrs
    # Let the attribute hash print itself as HTML attributes (key="value").
    @attrs.define_singleton_method(:to_s) do
      map { |key, val| "#{key}=\"#{val}\"" }.join(" ")
    end
    @name = name
    @all = [] # TODO: linked list
  end

  # Exposes attributes as methods, e.g. node.href.
  #
  # Deliberately does not call super: RjNodeSet#method_missing forwards to
  # the first node of the set, which for an RjNode is the node itself.
  #
  # Raises NoMethodError when there is no such attribute.
  def method_missing(key, *args)
    value = @attrs[key.to_s]
    return value if value
    raise NoMethodError.new("undefined method `#{key}' for #{inspect}", key)
  end

  def respond_to_missing?(key, include_private = false)
    @attrs.key?(key.to_s) || super
  end

  # A node is a set that contains only itself.
  def each
    yield self
  end

  # Child elements only (text nodes are left out). Cached until the child
  # list changes.
  def children
    @tags_cache ||= @all.reject { |node| node.instance_of? RjTextNode }
  end

  # Concatenated text of all descendants. Cached until the subtree changes
  # through #<< or #remove.
  def inner_text
    @text_cache ||= @all.map { |node| node.inner_text }.join
  end

  # 0-origin position among the parent's child elements; nil without parent.
  def index
    parent and parent.children.index(self)
  end

  # Next sibling element, or nil.
  def succ
    sibling_at(1)
  end

  # Previous sibling element, or nil (also for the first child).
  def prev
    sibling_at(-1)
  end

  # The sibling elements that follow this node.
  def siblings
    parent ? parent.children[(index + 1)..-1] : []
  end

  # Appends child, detaching it from its previous parent first.
  # Returns self so calls can be chained.
  def <<(child)
    child.parent.remove child if child.parent
    child.parent = self
    @all << child
    invalidate_caches
    self
  end

  # Detaches child from this node.
  def remove(child)
    child.parent = nil
    @all.delete child
    invalidate_caches
  end

  # Renders the subtree as HTML, one node per line, indented by one space
  # per nesting level. Childless elements are written as "<name attrs/>".
  def to_s
    html = "<#{name} #{attrs}>\n"
    stack = [[0, self]] # [index of next child to print, element]
    until stack.empty?
      cursor, node = stack.last
      if cursor >= node.all.size
        stack.pop
        html << " " * stack.size << "</#{node.name}>\n"
        next
      end
      child = node.all[cursor]
      stack.last[0] += 1
      html << " " * stack.size
      # Text nodes have no child list, so they must be recognised first.
      if child.instance_of?(RjTextNode)
        html << child.text << "\n"
      elsif child.all.empty?
        html << "<#{child.name} #{child.attrs}/>\n"
      else
        html << "<#{child.name} #{child.attrs}>\n"
        stack << [0, child]
      end
    end
    html
  end

  def inspect
    "<#{name} #{attrs}> ... </#{name}>"
  end

  protected

  # Used by #invalidate_caches to reset the cache of ancestor nodes.
  def reset_text_cache
    @text_cache = nil
  end

  private

  # Sibling element at the given offset from this node, or nil when this
  # node has no parent or the position falls outside the sibling list.
  def sibling_at(offset)
    position = index
    return nil if position.nil?
    target = position + offset
    # A negative index would wrap around to the end of the array.
    target < 0 ? nil : parent.children[target]
  end

  # Drops the cached child list of this node and the cached inner text of
  # this node and all of its ancestors (their text includes this subtree).
  def invalidate_caches
    @tags_cache = nil
    visited = {}
    node = self
    while node.is_a?(RjNode) && !visited[node.object_id]
      visited[node.object_id] = true # guards against accidental cycles
      node.reset_text_cache
      node = node.parent
    end
  end
end
# Leaf node holding a run of character data.
#
#   text   - the text content
#   parent - the owning node (assigned by whoever attaches this node)
class RjTextNode
  attr_accessor :text, :parent

  def initialize(text)
    self.text = text
  end

  # A text node's inner text and string form are the text itself.
  alias inner_text text
  alias to_s text

  def inspect
    "text:" + text.inspect
  end
end
# Builds a DOM from HTML text.
#
# html - String containing the markup.
#
# Returns the top-level element itself when there is exactly one, otherwise
# an RjNodeSet::ArraySet of the top-level elements (see RjNodeSet.from_a).
# Text outside of any element and whitespace-only text are dropped.
def RjQuery(html)
  # Elements that never have content. They are not pushed on the stack of
  # open elements even when written without a trailing "/", so the nodes
  # that follow them do not end up as their children.
  void_elements = %w[area base br col embed hr img input link meta param source track wbr]

  parser = RjStAX.new html
  open_nodes = [] # currently open elements, innermost last
  roots = []
  while parser.next_node
    case parser.node
    when :text
      text = parser.text.strip
      open_nodes.last << RjTextNode.new(text) if open_nodes.last && !text.empty?
    when :open, :empty
      node = RjNode.new parser.name, parser.attrs || {}
      (open_nodes.last || roots) << node
      open_nodes << node if parser.node == :open && !void_elements.include?(parser.name)
    when :close
      # Close the innermost open element with that name, together with
      # everything opened inside it. A stray close tag is ignored.
      depth = open_nodes.rindex { |candidate| candidate.name == parser.name }
      open_nodes.slice!(depth..-1) if depth
    end
  end
  RjNodeSet.from_a roots
end
##
## ## Test / demo code starts here ##
##
# Runs only when this file is executed directly (not when it is required).
if $0 == __FILE__ | |
# Parse the inline sample page into a node set.
page = RjQuery(<<-"__TEXT__") | |
<html lang="ja"> | |
<head> | |
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" /> | |
<meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
<title>WELCOME to Hisui's HOME PAGE!</title> | |
<script src="jquery-1.7.1.js"></script> | |
</head> | |
<body> | |
<h1>Welcome to Underground</h1> | |
<p> | |
中学時代のパソコンの授業でインターネットを使った時 | |
みんなが自分の好きな漫画や野球のページを見てる時に | |
自分だけこれみよがしに2chにつないでAAとかを周りに見せてたこと | |
しかも「このページって何?」って聞かれた時に | |
「ヤバイ奴らの集会所みたいなもん」とか答えたこと | |
さらに友達に2chへの行きかたを教えるためにヤフーで2chって検索させて | |
でてきたリンクをクリックして2chのトップページが表示された瞬間に | |
そいつの耳元で「Welcome to Underground」ってささやいたこと | |
</p> | |
<div> | |
<h3>環境変数一覧</h3> | |
<table id="env"> | |
<tr> <td>キー</td><td>値</td> </tr> | |
<tr> <td>HTTP_ACCEPT_LANGUAGE</td><td>ja,en-US;q=0.8,en;q=0.6</td> </tr> | |
<tr> <td>HTTP_ACCEPT_ENCODING</td><td>gzip,deflate,sdch</td> </tr> | |
<tr> <td>HTTP_USER_AGENT</td><td>Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2</td> </tr> | |
</table> | |
<h3>最近の書き込み</h3> | |
<table id="bbs"> | |
<tbody> | |
<tr> | |
<td>名前</td><td>内容</td> | |
</tr> | |
</tbody> | |
<tbody> | |
<tr colspan="2"><td>荒らしの書き込み</td></tr> | |
<tr><td>山ちゃん</td><td>オラオラオラ!</td></tr> | |
<tr><td>山ちゃん</td><td>オラオラオラ!</td></tr> | |
<tr><td>山ちゃん</td><td>オラオラオラ!</td></tr> | |
<tr colspan="2"><td>ユーザーの書き込み</td></tr> | |
<tr><td id="name_1"><a href="mailto:[email protected]">名無しさん</a></a></td><td>ヾ(*´∀`*)ノキャッキャ</td></tr> | |
<tr><td id="name_2"><a href="mailto:[email protected]">イケメン</a></td><td>よろしく~</td></tr> | |
<tr><td id="name_3"><a href="mailto:[email protected]">たぬき</a></td><td>おなかへった</td></tr> | |
<tr><td id="name_4"><a href="mailto:[email protected]">ボブ</a></td><td>こん!</td></tr> | |
<tr><td id="name_5"><a href="mailto:[email protected]">たぬき</a></td><td>こん~</td></tr> | |
<tr><td id="name_6"><a href="mailto:[email protected]">山ちゃん</a></td><td>オラオラオラ!</td></tr> | |
</tbody> | |
</table> | |
</div> | |
</body> | |
</html> | |
__TEXT__ | |
# Selector walk-through: inside table#bbs, take the tr that has a td
# containing the given heading text, move to the tr siblings after it ("~"),
# keep those matching :nth-child(2n+1), descend to td elements whose id
# starts with "name", and finally take their direct child a elements.
nodes = page.find("table#bbs tr:has(td:contains(ユーザーの書き込み))~tr:nth-child(2n+1) td[id^=name]>a") | |
# Print the text and the href attribute of every matched node.
nodes.to_a.each{|node| | |
puts "#{node.inner_text} : #{node.attrs["href"]}" | |
} | |
end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment