Created
December 13, 2013 19:28
-
-
Save epitron/73db08093fc8248be3f1 to your computer and use it in GitHub Desktop.
Turn a Rails controller into a web-proxy (web.archive.org-style)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'pp'
require 'uri'
#
# Turns a Rails controller into a web proxy (web.archive.org-style).
#
# A request for "/proxy/<target url>" downloads <target url> on behalf of the
# current user and relays the response. HTML responses have their links
# rewritten so that they also point back through "/proxy/".
#
# NOTE(review): the target URL comes straight from the request path and is
# fetched server-side without any scheme/host restriction. If this is ever
# exposed to untrusted users, add an allow-list (http/https only, no internal
# hosts) before calling the HTTP client.
#
class ProxyController < ApplicationController
  # Tag name => attribute that holds a URL which must be routed via the proxy.
  REWRITABLE_ATTRIBUTES = {
    "a"      => "href",
    "img"    => "src",
    "form"   => "action",
    "script" => "src",
    "link"   => "href"
  }.freeze

  # Links using these schemes are not fetchable resources; prefixing them with
  # "/proxy/" would only break them, so they are left untouched.
  UNPROXIED_SCHEME = /\A\s*(?:javascript|mailto|data|tel):/i

  # Captures the target of <meta http-equiv="refresh" content="0; url=...">.
  META_REFRESH_URL = /(?<=\burl=)(.+)(?=\s|$)/i

  # Example Rack environment for a request to /proxy/http://google.com/
  # (cookie values redacted):
  # {"GATEWAY_INTERFACE"=>"CGI/1.1",
  #  "PATH_INFO"=>"/proxy/http:/google.com",
  #  "QUERY_STRING"=>"",
  #  "REMOTE_ADDR"=>"127.0.0.1",
  #  "REMOTE_HOST"=>"127.0.0.1",
  #  "REQUEST_METHOD"=>"GET",
  #  "REQUEST_URI"=>"http://localhost:3000/proxy/http://google.com/",
  #  "SCRIPT_NAME"=>"",
  #  "SERVER_NAME"=>"localhost",
  #  "SERVER_PORT"=>"3000",
  #  "SERVER_PROTOCOL"=>"HTTP/1.1",
  #  "SERVER_SOFTWARE"=>"WEBrick/1.3.1 (Ruby/2.0.0/2013-06-27)",
  #  "HTTP_USER_AGENT"=>
  #   "Opera/9.80 (X11; Linux x86_64) Presto/2.12.388 Version/12.16",
  #  "HTTP_HOST"=>"localhost:3000",
  #  "HTTP_ACCEPT"=>
  #   "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
  #  "HTTP_ACCEPT_LANGUAGE"=>"en-US,en;q=0.9",
  #  "HTTP_ACCEPT_ENCODING"=>"gzip, deflate",
  #  "HTTP_COOKIE"=>
  #   "remember_user_token=<redacted>; _hibar_session=<redacted>",
  #  "HTTP_CACHE_CONTROL"=>"no-cache",
  #  "HTTP_CONNECTION"=>"Keep-Alive",
  #  "HTTP_VERSION"=>"HTTP/1.1",
  #  "REQUEST_PATH"=>"/proxy/http://google.com/",
  #  "ORIGINAL_FULLPATH"=>"/proxy/http://google.com/",
  #  "ORIGINAL_SCRIPT_NAME"=>"",
  #  "ROUTES_25203060_SCRIPT_NAME"=>""}
  #
  # Related `request` accessors: referer, base_url, original_fullpath, host,
  # port, fullpath, url, host_with_port, original_url.

  #
  # Fetch the URL embedded in the request path and relay it to the browser.
  #
  # The HTTP client is created with `follow_redirects: false`; when the fetched
  # page reports a redirect, the browser is redirected to the proxified target
  # instead, so the redirect is followed through "/proxy/" as well.
  #
  # Raises RuntimeError when the request path does not start with "/proxy/".
  #
  def index
    http = ::HTTPClient.new(
      cookie_file:      current_user.cookie_file,
      verbose:          true,
      follow_redirects: false,
      raise_exceptions: false,
      logger:           Rails.logger
    )

    logger.info "HTTP Referer: #{request.referer}"

    url = target_url
    logger.info "* Downloading: #{url}"
    page = http.get(url)
    logger.info "* Info: #{page.link_info}"

    # Check for a redirect first: its body is never rendered, so there is no
    # point parsing and rewriting it.
    if page.redirect
      logger.info "redirecting_to: #{page.redirect}"
      redirect_to proxify_url(url, page.redirect)
    else
      # NOTE(review): `render text:` was removed in Rails 5.1; switch to
      # `render body:` if this controller is moved to a newer Rails.
      render text: response_body_for(page, url), content_type: page.mime_type, status: page.code
    end
  end

  protected

  #
  # Extract the URL to download from the request path ("/proxy/<url>").
  #
  # `original_fullpath` normally already carries the query string, so the
  # query is only appended when it is not present yet (appending it
  # unconditionally yields "...?a=1?a=1").
  #
  def target_url
    fullpath = request.original_fullpath
    match    = %r{\A/proxy/(.+)}.match(fullpath)
    raise "original_fullpath is broken: #{fullpath}" unless match

    url   = match[1]
    query = request.query_string
    url += "?#{query}" unless query.blank? || url.end_with?("?#{query}")
    url
  end

  #
  # Body to send back for `page`: HTML gets its URLs rewritten in place and is
  # re-serialized; everything else is passed through unchanged.
  #
  def response_body_for(page, url)
    # `to_s` guards against a response without a content type.
    return page.body unless page.mime_type.to_s.start_with?("text/html")

    rewrite_urls!(page, url)
    page.parser.to_html
  end

  # Percent-encode literal spaces, which URI.parse rejects.
  def fix_url(url)
    url.gsub(" ", "%20")
  end

  #
  # Resolve `link` against `current_url` (a String or a URI) and return the
  # resulting absolute URL prefixed with "/proxy/".
  #
  # Raises URI::InvalidURIError when either URL cannot be parsed.
  #
  def proxify_url(current_url, link)
    current_url = URI.parse(fix_url(current_url)) unless current_url.is_a?(URI)
    "/proxy/#{current_url + fix_url(link)}"
  end

  #
  # Rewrite all urls on the page so that they're absolute urls (relative to
  # `current_url`) and prefixed by "/proxy". Mutates `page`.
  #
  def rewrite_urls!(page, current_url)
    current_url = URI.parse(fix_url(current_url))

    REWRITABLE_ATTRIBUTES.each do |tagname, attrname|
      page.search(tagname).each do |node|
        tag_url = node[attrname]
        next if tag_url.nil? || tag_url =~ UNPROXIED_SCHEME

        node[attrname] = proxify_or_flag(current_url, tag_url)
      end
    end

    # e.g. <meta http-equiv="refresh" content="0; url=http://tinyurl.com/pemjju3">
    # Every <meta> is inspected: the refresh tag is rarely the first one.
    page.search("meta").each do |meta|
      next unless meta["http-equiv"].to_s.downcase == "refresh"

      content = meta["content"]
      next if content.nil?

      meta["content"] = content.gsub(META_REFRESH_URL) do |target|
        begin
          proxify_url(current_url, target)
        rescue URI::Error => e
          # Leave an unparseable refresh target as it was.
          logger.error e.inspect
          target
        end
      end
    end
  end

  #
  # Proxified form of `tag_url`, or a `javascript:` placeholder that tells the
  # user the link could not be parsed.
  #
  def proxify_or_flag(current_url, tag_url)
    proxify_url(current_url, tag_url)
  rescue URI::Error => e
    logger.error e.inspect
    # The URL comes from a remote page: reduce it to a harmless character set
    # (no quotes, parentheses, backslashes or percent-escapes) so it cannot
    # break out of the string literal and run script in the proxy's origin.
    printable = fix_url(tag_url).gsub(/[^\w.:\/?&=#~+-]/, "_")
    "javascript:alert('Unable to parse original url: #{printable}');"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment