Created
June 24, 2014 14:26
-
-
Save takuya/546c78ef5cba9d09456d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# coding : utf-8
require 'base64'
require 'kconv'
require 'uri'

require 'mechanize'
class Mechanize
  # Extension of Mechanize::Page that inlines external resources (images,
  # stylesheets, scripts) into the parsed document as base64 `data:` URIs, so
  # the page can be saved as a single self-contained HTML file.
  #
  # Every helper fetches resources through the owning agent (@mech). Each fetch
  # changes the agent's "current page", so after every fetch the agent history
  # is reset to the page that was being processed.
  class Page
    # Inlines images, stylesheets and scripts of the agent's current page.
    #
    # @return [String] serialization of the nodes matched by search("/")
    def embed_body
      embed_images
      embed_style
      embed_script
      search('/').to_s
    end

    # Replaces the src of every <script src=...> with a base64 data URI.
    #
    # @param page [Mechanize::Page, nil] document to rewrite (default: @mech.page)
    # @param base_uri [URI, String, nil] base for relative src values
    #   (default: @mech.page.uri)
    # @raise [StandardError] fetch/encode errors are re-raised only when $DEBUG is set
    def embed_script(page = nil, base_uri = nil)
      page ||= @mech.page
      base_uri ||= @mech.page.uri
      page.search('script[src]').each do |node|
        src = node.attr('src').to_s.strip
        # Nothing to fetch for an empty or already embedded script.
        next if src.empty? || src =~ /\Adata:/i

        uri = nil
        begin
          uri = URI.join(base_uri, src)
          @mech.get uri
          script_text = @mech.page.body.toutf8
          node['src'] = "data:;base64,#{Base64.strict_encode64(script_text)}"
        rescue Mechanize::ResponseCodeError => e
          # Reported on stderr: stdout carries the generated HTML.
          $stderr.puts e.backtrace if $DEBUG
          $stderr.puts "404 エラー出たっぽい: #{uri}"
        rescue StandardError => e
          embed_report_failure(e, uri || src)
        ensure
          embed_restore_page(page, base_uri)
        end
      end
    end

    # Removes all <script> elements, points script-like links at "#noscript"
    # and strips on* event-handler attributes from <a> elements.
    #
    # @param page [Mechanize::Page, nil] document to rewrite (default: @mech.page)
    def remove_script(page = nil)
      page ||= @mech.page
      page.search('script').each(&:remove)
      page.search('a').each do |anchor|
        anchor['href'] = '#noscript' if anchor.attr('href') =~ /script/i
        anchor.attributes.keys.each do |name|
          anchor.remove_attribute(name) if name =~ /\Aon/i
        end
      end
    end

    # Replaces every url(...) reference in a CSS text with a base64 data URI.
    #
    # Relative references are resolved against the URI of the agent's current
    # page, which is the stylesheet itself when called while that stylesheet
    # is the page most recently fetched. References that are already data
    # URIs, or that cannot be fetched, are left untouched.
    #
    # @param css_text [String] CSS source
    # @return [String] CSS with fetchable url() targets inlined
    # @raise [StandardError] fetch errors are re-raised only when $DEBUG is set
    def embed_css_url(css_text)
      page = @mech.page
      base_uri = page.uri
      css_text.to_s.gsub(/url\(([^)]+)\)/) do
        original = Regexp.last_match(0)
        target = Regexp.last_match(1).delete("'\"").strip
        # Returning the match itself keeps the declaration intact.
        next original if target.empty? || target =~ /\Adata:/i

        begin
          @mech.get parse_uri(base_uri, target)
          resource = @mech.page
          content_type = resource.header['content-type']
          payload = resource.body
          payload = payload.toutf8 if content_type =~ /^text/
          "url(\n'data:#{content_type};base64,#{Base64.strict_encode64(payload)}'\n)"
        rescue StandardError => e
          embed_report_failure(e, target)
          original
        ensure
          embed_restore_page(page, base_uri)
        end
      end
    end

    # Replaces each line containing an @import rule with the content of the
    # imported stylesheet (whose own url() references are inlined as well).
    # A line whose import cannot be fetched is kept unchanged.
    #
    # @param css [String] CSS source
    # @param page [Object, nil] page restored as current after each fetch
    #   (default: @mech.page)
    # @param base_uri [URI, String, nil] URI of the stylesheet issuing the
    #   @import; needed to resolve relative imports (default: page.uri)
    # @return [String] CSS with imports expanded, converted with toutf8
    # @raise [StandardError] fetch errors are re-raised only when $DEBUG is set
    def embed_style_import(css, page = nil, base_uri = nil)
      page ||= @mech.page
      base_uri ||= page.uri
      expanded = css.lines.map do |line|
        next line unless line =~ /@import/i

        # Reduce `@import url("x.css");` / `@import "x.css";` to the bare target.
        target = line.gsub(/@import/i, '').delete("'\";").strip
        target = target.gsub(/url\(([^)]+)\)/) { Regexp.last_match(1) }.strip
        begin
          @mech.get parse_uri(base_uri, target)
          "#{embed_css_url(@mech.page.body.toutf8)}\n"
        rescue StandardError => e
          embed_report_failure(e, target)
          line
        ensure
          embed_restore_page(page, base_uri)
        end
      end
      expanded.join.toutf8
    end

    # Inlines CSS: rewrites url() references inside <style> elements and
    # replaces each <link rel=stylesheet href=...> with a <style> element
    # appended to <head>. A link is removed only after its replacement has
    # been inserted, so a failed fetch leaves the original link in place.
    #
    # @param page [Mechanize::Page, nil] document to rewrite (default: @mech.page)
    # @param base_uri [URI, String, nil] base for relative hrefs
    #   (default: @mech.page.uri)
    # @raise [StandardError] fetch errors are re-raised only when $DEBUG is set
    def embed_style(page = nil, base_uri = nil)
      page ||= @mech.page
      base_uri ||= @mech.page.uri
      page.search('style').each do |node|
        node.content = embed_css_url(node.text)
      end
      page.search('link[rel*=stylesheet][href]').each do |link|
        href = link.attr('href')
        begin
          @mech.get parse_uri(base_uri, href)
          css = @mech.page.body.lines.reject { |l| l =~ /@charset/i }.join.toutf8
          css = css.gsub(%r{/\*.*?\*/}m, '') # drop comments
          # The fetched stylesheet is still the agent's current page here, so
          # both helpers resolve relative references against its URI.
          css = embed_css_url(embed_style_import(css))
          head = page.search('head').first
          head.add_child("\n<style type='text/css'>\n\n#{css}\n\n</style>\n")
          link.remove
        rescue StandardError => e
          embed_report_failure(e, href)
        ensure
          embed_restore_page(page, base_uri)
        end
      end
    end

    # Replaces the src of every <img src=...> / <input src=...> with a base64
    # data URI built from the fetched resource and its content-type header.
    #
    # @param page [Mechanize::Page, nil] document to rewrite (default: @mech.page)
    # @param base_uri [URI, String, nil] base for relative src values
    #   (default: @mech.page.uri)
    # @raise [StandardError] fetch errors are re-raised only when $DEBUG is set
    def embed_images(page = nil, base_uri = nil)
      page ||= @mech.page
      base_uri ||= @mech.page.uri
      page.search('img[src],input[src]').each do |node|
        src = node.attr('src').to_s.strip
        # Nothing to fetch for an empty or already embedded image.
        next if src.empty? || src =~ /\Adata:/i

        uri = nil
        begin
          uri = URI.join(base_uri, src)
          @mech.get uri
          image = @mech.page
          node['src'] = "data:#{image['content-type']};base64,#{Base64.strict_encode64(image.body)}"
        rescue Mechanize::ResponseCodeError => e
          # Reported on stderr: stdout carries the generated HTML.
          $stderr.puts e.backtrace if $DEBUG
          $stderr.puts "404 エラー出たっぽい: #{uri}"
        rescue StandardError => e
          embed_report_failure(e, uri || src)
        ensure
          embed_restore_page(page, base_uri)
        end
      end
    end

    # Builds a URI for a reference found in a document.
    #
    # Relative references are joined onto base_uri. Absolute http(s) URLs are
    # parsed as-is, except that the query string is unescaped and re-escaped
    # so raw characters (spaces, non-ASCII) become valid without
    # double-escaping sequences that were already escaped.
    #
    # @param base_uri [URI, String] base for relative references
    # @param href [String] reference to resolve
    # @return [URI]
    # @raise [URI::InvalidURIError] when the reference cannot be parsed
    def parse_uri(base_uri, href)
      href = href.to_s.strip
      return URI.join(base_uri, href) unless href =~ /\Ahttps?:/i

      # Split on the first "?" only, so later "?" characters stay in the query.
      location, query = href.split('?', 2)
      return URI.parse(location) if query.nil? || query.empty?

      # URI.escape / URI.unescape were removed in Ruby 3.0; the RFC 2396
      # parser provides the same behavior.
      parser = URI::RFC2396_Parser.new
      URI.parse("#{location}?#{parser.escape(parser.unescape(query))}")
    end

    private

    # Makes `page` the agent's current page again after a resource fetch.
    def embed_restore_page(page, base_uri)
      @mech.history.clear
      @mech.history.push page, base_uri
    end

    # Best-effort error policy shared by the embed_* methods: failures are
    # silent unless $DEBUG is set, in which case they are logged and re-raised.
    def embed_report_failure(error, target)
      return unless $DEBUG

      $stderr.puts "uri err occured. => '#{target}'"
      $stderr.puts error.backtrace
      raise error
    end
  end
end
# Command-line entry point: fetch the URL given as the first argument, inline
# its images, stylesheets and scripts, and print the resulting HTML to stdout.
#
# Uncomment to log fetch failures and re-raise them:
# $DEBUG = true
url = ARGV.shift
unless url
  # Usage goes to stderr because stdout is reserved for the generated HTML.
  $stderr.puts "Usage #{__FILE__} url "
  $stderr.puts " example 1 : #{__FILE__} http://example.com "
  # A local file URL needs three slashes; with two, "Users" is read as a host.
  $stderr.puts " example 2 : #{__FILE__} file:///Users/takuya/Desktop/hoge.html "
  exit
end

agent = Mechanize.new
agent.get(url)
agent.page.embed_body
puts agent.page.search('/').first.to_html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment