Created
February 15, 2014 04:18
-
-
Save yswallow/9014529 to your computer and use it in GitHub Desktop.
htmlの本文のxpathを探すアルゴリズム()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #usage: xpath = mobilizer(html) | |
| require 'nokogiri' | |
| require 'kconv' | |
# Finds the path prefix shared by the dominant share of the given paths.
#
# Each path is split on '/' and the components are compared level by level.
# At every level the most frequent component is chosen; if it is carried by
# at least `threshold` of the ORIGINAL number of paths, it is appended to the
# result and every path that disagrees is dropped. The walk stops at the first
# level where no component reaches that share, or where the most frequent
# "component" is the end of the path.
#
# @param pathes [Array<String>] '/'-separated paths (not modified)
# @param threshold [Numeric] fraction of the original path count that must
#   agree on a component for it to be accepted
# @return [String, nil] the agreed prefix joined with '/', the single path
#   unchanged when only one is given, or nil when `pathes` is empty
def search_equal(pathes, threshold: 0.7)
  return nil if pathes.empty?
  return pathes.first if pathes.size == 1

  # Work on a split copy so the caller's array is left untouched.
  candidates = pathes.map { |path| path.split('/') }
  required = pathes.size * threshold
  depth = candidates.map(&:size).max

  common = []
  depth.times do |i|
    counts = Hash.new(0)
    candidates.each { |parts| counts[parts[i]] += 1 }

    top_key, top_count = counts.max_by { |_key, count| count }
    # A nil key means most surviving paths have already ended at this depth.
    break if top_key.nil? || top_count < required

    common << top_key
    candidates.select! { |parts| parts[i] == top_key }
  end

  common.join('/')
end
# Guesses the XPath of the element that holds the main body text of a page.
#
# The HTML is converted to UTF-8 and parsed with Nokogiri, script/style
# elements are removed, and every p/pre element is swapped for its own text.
# Each remaining text node then adds a weight to its parent's path: the full
# text length when it is 80 characters or more, half of it otherwise. Parent
# paths whose total exceeds 100 are handed to search_equal.
#
# @param html [String] raw HTML source
# @return whatever search_equal returns for the selected parent paths
def mobilizer(html)
  doc = Nokogiri::HTML(html.toutf8, nil, 'UTF-8')

  %w[script style].each { |name| doc.xpath("//#{name}").remove }

  %w[p pre].each do |name|
    doc.xpath("//#{name}").each { |element| element.swap(element.text) }
  end

  # Accumulated text weight, keyed by the XPath of each text node's parent.
  scores = Hash.new(0)
  doc.xpath('//text()').each do |text_node|
    length = text_node.text.size
    weight = length >= 80 ? length : length / 2
    scores[text_node.parent.path] += weight
  end

  pathes = []
  scores.each do |path, score|
    next unless score > 100

    puts "#{path}, #{score}" if $DEBUG
    pathes << path
  end

  xpath = search_equal(pathes)
  puts xpath if $DEBUG
  xpath
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment