Created
December 17, 2013 14:54
-
-
Save gear11/8006132 to your computer and use it in GitHub Desktop.
Simple Python proxy server based on Flask and Requests. See: http://gear11.com/2013/12/python-proxy-server/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A simple proxy server. Usage: | |
http://hostname:port/p/(URL to be proxied, minus protocol) | |
For example: | |
http://localhost:8080/p/www.google.com | |
""" | |
from flask import Flask, render_template, request, abort, Response, redirect | |
import requests | |
import logging | |
# Flask application, named after the first dotted component of this module's name.
app = Flask(__name__.partition('.')[0])
logging.basicConfig(level=logging.INFO)

# Hosts the proxy is allowed to fetch from; anything else is rejected.
APPROVED_HOSTS = {"google.com", "www.google.com", "yahoo.com"}
# Number of bytes read from the upstream response per streamed chunk.
CHUNK_SIZE = 1024
LOG = logging.getLogger("main.py")
@app.route('/<path:url>')
def root(url):
    """Handle any path that is not under the ``/p/`` proxy prefix.

    If the Referer shows that the request came from a previously proxied
    page, redirect to the same path under ``/p/<proxied host>/``. This allows
    server-relative and protocol-relative URLs inside proxied pages to work.
    Otherwise the default ``hello.html`` template is rendered.

    :param url: the request path captured by the route (without query string).
    :returns: a redirect response, or the rendered template.
    """
    LOG.info("Root route, path: %s", url)
    proxy_ref = proxy_ref_info(request)
    if proxy_ref:
        # Werkzeug exposes query_string as raw bytes; concatenating it to a
        # str raises TypeError on Python 3, so decode it first.
        query = request.query_string
        if isinstance(query, bytes):
            query = query.decode("utf-8", "replace")
        suffix = "?" + query if query else ""
        redirect_url = "/p/%s/%s%s" % (proxy_ref[0], url, suffix)
        LOG.info("Redirecting referred URL to: %s", redirect_url)
        return redirect(redirect_url)
    # Otherwise, default behavior
    return render_template('hello.html', name=url, request=request)
# Hop-by-hop headers describe a single connection (RFC 7230 section 6.1) and
# must not be copied from the upstream response onto the proxied response.
_HOP_BY_HOP_HEADERS = frozenset([
    "connection",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "te",
    "trailer",
    "trailers",
    "transfer-encoding",
    "upgrade",
])


@app.route('/p/<path:url>')
def proxy(url):
    """Fetch the specified URL and stream it out to the client.

    If the request was referred by the proxy itself (e.g. this is an image
    fetch for a previously proxied HTML page), then the original Referer is
    passed to the upstream server.

    The upstream body is forwarded byte-for-byte (still compressed if the
    upstream compressed it) so that the forwarded ``Content-Encoding`` and
    ``Content-Length`` headers stay accurate. The upstream status code is
    forwarded as well.

    :param url: URL to be proxied, minus protocol.
    :returns: a streaming ``Response`` mirroring the upstream response.
    """
    r = get_source_rsp(url)
    LOG.info("Got %s response from %s", r.status_code, url)
    # Drop connection-level headers: the body below is already de-chunked, so
    # forwarding e.g. "Transfer-Encoding: chunked" would corrupt the stream.
    headers = dict(
        (name, value)
        for name, value in r.headers.items()
        if name.lower() not in _HOP_BY_HOP_HEADERS
    )

    def generate():
        try:
            # decode_content=False keeps gzip/deflate bodies untouched;
            # iter_content() would decompress them while the headers still
            # announce the compressed encoding and length.
            for chunk in r.raw.stream(CHUNK_SIZE, decode_content=False):
                yield chunk
        finally:
            # Release the upstream connection once streaming ends or aborts.
            r.close()

    return Response(generate(), status=r.status_code, headers=headers)
def get_source_rsp(url):
    """Fetch ``http://<url>`` upstream and return the streaming response.

    The request is aborted with HTTP 403 when the target is not approved.
    When the current request was referred by a proxied page, the original
    (un-proxied) Referer is reconstructed and sent upstream. The query
    parameters of the current request are passed along.

    :param url: URL to fetch, minus protocol.
    :returns: the ``requests`` response, opened with ``stream=True``.
    """
    url = 'http://%s' % url
    LOG.info("Fetching %s", url)
    # Ensure the URL is approved, else abort
    if not is_approved(url):
        # Logger.warn is a deprecated alias that newer Python versions no
        # longer provide; warning() is the supported spelling.
        LOG.warning("URL is not approved: %s", url)
        abort(403)
    # Pass original Referer for subsequent resource requests
    proxy_ref = proxy_ref_info(request)
    if proxy_ref:
        headers = {"Referer": "http://%s/%s" % (proxy_ref[0], proxy_ref[1])}
    else:
        headers = {}
    # Fetch the URL, and stream it back
    # NOTE(review): no timeout is passed to requests.get — confirm whether an
    # unresponsive upstream should be allowed to block indefinitely.
    LOG.info("Fetching with headers: %s, %s", url, headers)
    return requests.get(url, stream=True, params=request.args, headers=headers)
def is_approved(url):
    """Return whether the host of *url* may be fetched.

    Only hosts listed in ``APPROVED_HOSTS`` are allowed, which prevents the
    server from becoming an open proxy.
    """
    _proto, host, _uri = split_url(url)
    return host in APPROVED_HOSTS
def split_url(url):
    """Split *url* into a ``(protocol, host, uri)`` tuple.

    The protocol is everything before the first ``:``. The two characters
    following that colon are discarded (they are taken to be ``//``). The
    host then runs up to the next ``/`` and the uri is whatever follows it,
    or ``""`` when there is no further ``/``.

    Raises ``ValueError`` when *url* contains no ``:``.
    """
    proto, remainder = url.split(':', 1)
    host, _sep, uri = remainder[2:].partition('/')
    return (proto, host, uri)
def proxy_ref_info(request):
    """Parse Referer info indicating the request came from a proxied page.

    For example, if:
        Referer: http://localhost:8080/p/google.com/search?q=foo
    then the result is:
        ("google.com", "search?q=foo")

    :param request: object whose ``headers.get('referer')`` yields the
        Referer header value (or a falsy value when absent).
    :returns: a ``(host, uri)`` tuple for the originally proxied resource, or
        ``None`` when there is no Referer, it cannot be parsed, or its path
        does not start with a proxy prefix (``p`` or ``d``).
    """
    ref = request.headers.get('referer')
    if not ref:
        return None
    try:
        _, _, uri = split_url(ref)
    except ValueError:
        # The Referer is client-supplied; a value without a "scheme:" prefix
        # cannot be split and simply is not a proxy referral.
        return None
    if "/" not in uri:
        return None
    first, rest = uri.split("/", 1)
    # Compare against the whole prefix. `first in "pd"` was a substring test
    # and therefore also accepted "" and "pd" as proxy prefixes.
    # NOTE(review): no "/d/" route is visible in this file section; the "d"
    # prefix is kept as in the original — confirm it is still needed.
    if first not in ("p", "d"):
        return None
    host, _sep, path = rest.partition("/")
    LOG.info("Referred by proxy host, uri: %s, %s", host, path)
    return (host, path)
I'm still getting "Illegal or missing hexadecimal sequence in chunked-encoding" :(
I tried both r.raw.data and r.raw.stream(...).
I can get the response content via r.text and even construct the Response, but somehow the client still fails with the error above.
The response headers were:
Content-Type: application/vnd.orcid+xml; qs=5; charset=UTF-8
Transfer-Encoding: chunked
Connection: keep-alive
Content-Encoding: gzip
Any ideas what might have gone wrong?
@gear11 do you have a preferred license for this? My preference is for MIT, but since I'm forking this wanted to pass that by you.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Don't use generate() (line 48), or you will get truncated data when the content is gzip-encoded.
Either use r.raw.data (and delete generate function) or iterate over r.raw.stream(decode_content=False) instead of r.iter_content()