YouTube Reverse Proxy
# Worker nginx configuration (these directives live inside a server {} block).
# Each location proxies a prefixed path back to the matching Google host:
#   /video/<host>/...  ->  <host>.googlevideo.com
#   /image/<host>/...  ->  <host>.ytimg.com
#   /photo/<host>/...  ->  <host>.ggpht.com
resolver 8.8.8.8;

location /video/ {
    if ($request_uri ~ "^/video/(.+?)/.+") {
        set $upstream_host $1.googlevideo.com;
        add_header Content-Disposition "attachment; filename=video.mp4;";
    }
    rewrite /video/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass https://$upstream_host;
    proxy_set_header Host $upstream_host;
}

location /image/ {
    if ($request_uri ~ "^/image/(.+?)/.+") {
        set $upstream_host $1.ytimg.com;
    }
    rewrite /image/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass http://$upstream_host;
    proxy_set_header Host $upstream_host;
}

location /photo/ {
    if ($request_uri ~ "^/photo/(.+?)/.+") {
        set $upstream_host $1.ggpht.com;
    }
    rewrite /photo/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass http://$upstream_host;
    proxy_set_header Host $upstream_host;
}
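To see how these locations are used, here is a minimal sketch of the rewrite the Python front end (below) applies before redirecting a client to a worker. The worker address and the videoplayback query string are made up; the regex and the replacement function are the ones defined in the script.

    # Minimal sketch; worker host and query string are hypothetical.
    import re
    import functools

    RE_GOOGLEVIDEO = re.compile(r'/([a-zA-Z0-9-]+?)\.googlevideo\.com', re.IGNORECASE)

    def replace_googlevideo(worker, match):
        return '/%s/video/%s' % (worker[0], match.group(1))

    worker = ['203.0.113.10', True]  # [host, alive flag]
    stream_url = 'http://r4---sn-abc123.googlevideo.com/videoplayback?itag=22'
    print(RE_GOOGLEVIDEO.sub(functools.partial(replace_googlevideo, worker), stream_url))
    # -> http://203.0.113.10/video/r4---sn-abc123/videoplayback?itag=22

The worker's "location /video/" block then reconstructs r4---sn-abc123.googlevideo.com from the first path segment, strips it with the rewrite, and proxies the request upstream.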
#!/usr/bin/env python
# Front-end proxy (Python 2): fetches youtube.com pages and rewrites them so
# that images, CSS and video streams are served through the nginx workers
# listed in WORKERS.
import logging
import httplib
import os
import subprocess
import socket
import datetime
import random
import signal
import urllib2
import urlparse
import urllib
import re
from gevent.wsgi import WSGIServer  # gevent.pywsgi.WSGIServer on newer gevent releases
import gevent.monkey
import gevent.pool
import gevent
import redis
import cgi
import functools
import Cookie

REDIS = redis.StrictRedis()
gevent.monkey.patch_all(subprocess=True)
proc_pool = gevent.pool.Pool(size=16)
LOGGER = logging.getLogger(__name__)
WORKERS = {
}  # you need to fill this: {server name: [[worker host, alive flag], ...]}
LISTEN_IP = ''
LISTEN_PORT = 3000
RE_YTIMG_CSS = re.compile(r'/s\.ytimg\.com(.*?\.css)', re.IGNORECASE)
RE_YTIMG_ESC = re.compile(r'\\/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
RE_YTIMG = re.compile(r'/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
RE_GGPHT = re.compile(r'https://([a-zA-Z0-9-]+?)\.ggpht\.com', re.IGNORECASE)
RE_GOOGLEVIDEO = re.compile(r'/([a-zA-Z0-9-]+?)\.googlevideo\.com', re.IGNORECASE)

def handle_request(environ, start_response):
    method = environ.get('REQUEST_METHOD')
    try:
        # wrap start_response so numeric codes become proper '200 OK' status lines
        lines = handle(environ, lambda status, headers: start_response(get_http_response(status), headers))
    except:
        path = environ.get('PATH_INFO', '').strip('/')
        LOGGER.exception('failed to handle request: %s %s' % (method, path))
        start_response('500 INTERNAL_SERVER_ERROR', [
            ('Content-Type', 'text/javascript'),
            ('Cache-Control', 'no-cache, no-store, must-revalidate'),
            ('Pragma', 'no-cache'),
            ('Expires', '0')])
        lines = ['Retry in 30 minutes']
    for line in lines:
        yield line


def get_http_response(code):
    # turn a numeric status code into a WSGI status line such as '200 OK'
    if code not in httplib.responses:
        return '%s Unknown' % code
    return '%s %s' % (code, httplib.responses[code])


def replace_ytimg_css(match):
    return '/your-reverse-proxy-ip/css/%s' % match.group(1)


def replace_ytimg_esc(match):
    return '\\/%s\\/image/%s' % (pick_worker()[0], match.group(1))


def replace_ytimg(match):
    return '/%s/image/%s' % (pick_worker()[0], match.group(1))


def replace_ggpht(match):
    return 'http://%s/photo/%s' % (pick_worker()[0], match.group(1))


def replace_googlevideo(worker, match):
    return '/%s/video/%s' % (worker[0], match.group(1))

def handle(environ, start_response):
    host = 'youtube.com'
    path = environ.get('PATH_INFO', '')
    if '/watch' == path:
        video_id = urlparse.parse_qs(environ['QUERY_STRING'])['v'][0]
        return handle_watch(video_id, environ, start_response)
    if '/watch_videos' == path:
        video_id = urlparse.parse_qs(environ['QUERY_STRING'])['video_ids'][0].split(',')[0]
        return handle_watch(video_id, environ, start_response)
    if path.startswith('/css/'):
        upstream_url = path.replace('/css/', '')
        upstream_url = 'http://s.ytimg.com/%s' % upstream_url
        return handle_css(upstream_url, environ, start_response)
    if path.startswith('/t/'):
        # /t/<domain>.js returns a tiny script that redirects the browser to http://<domain>
        domain = path.replace('/t/', '').replace('.js', '')
        words = 'window.location.href="http://%s";' % domain
        start_response(httplib.OK, [
            ('Content-Type', 'text/javascript'),
            ('Cache-Control', 'no-cache, no-store, must-revalidate'),
            ('Pragma', 'no-cache'),
            ('Expires', '0')])
        return [words]
    if path.startswith('//'):
        # protocol-relative links: bounce the browser straight to the real host
        start_response(httplib.FOUND, [
            ('Location', 'http:%s' % path)
        ])
        return []
    data = None
    if 'POST' == environ['REQUEST_METHOD']:
        if '/results' == path:
            post_body = cgi.FieldStorage(
                fp=environ['wsgi.input'],
                environ=environ,
                keep_blank_values=True)
            upstream_url = 'http://youtube.com/results?%s' % urllib.urlencode({'search_query': post_body['search_query'].value})
        else:
            # note: only the first line of the POST body is forwarded upstream
            data = environ['wsgi.input'].readline()
            upstream_url = 'http://%s%s' % (host, path)
    else:
        upstream_url = 'http://%s%s' % (host, path)
    if environ['QUERY_STRING']:
        upstream_url = '%s?%s' % (upstream_url, environ['QUERY_STRING'])
    LOGGER.info('upstream url: %s' % upstream_url)
    headers = {}
    if environ.get('HTTP_COOKIE'):
        LOGGER.info('cookie is: %s' % environ.get('HTTP_COOKIE'))
        headers['Cookie'] = environ.get('HTTP_COOKIE')
    try:
        response = urllib2.urlopen(urllib2.Request(upstream_url, data=data, headers=headers))
    except urllib2.HTTPError as e:
        start_response(e.code, [(k, v) for k, v in e.hdrs.items()])
        return [e.msg]
    except:
        raise
    headers = []
    for k, v in response.headers.items():
        if 'set-cookie' == k.lower():
            # keep cookies but drop the youtube.com domain restriction
            v = v.replace('domain=.youtube.com;', '')
        if 'x-frame' in k.lower():
            continue
        headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    # point static assets and video links at the workers
    body = RE_YTIMG_CSS.sub(replace_ytimg_css, body)
    body = RE_YTIMG_ESC.sub(replace_ytimg_esc, body)
    body = RE_YTIMG.sub(replace_ytimg, body)
    body = RE_GGPHT.sub(replace_ggpht, body)
    # body = body.replace('class="search-form', 'method="POST" class="search-form')
    # hide mastheads, banners and ad containers
    body = body.replace('class="video-masthead">', 'style="display: none;">')
    body = body.replace('class="branded-page-v2-top-row">', 'style="display: none;">')
    body = body.replace('style="z-index: 1">', 'style="display: none;">')
    body = body.replace('style="z-index: 1;">', 'style="display: none;">')
    body = body.replace('class="premium-yva-unexpanded"', 'style="display: none;"')
    # body = body.replace('id="masthead-search"', 'style="position: relative; padding: 0; margin-top: 3px; overflow: hidden;"')
    body = body.replace('ad.doubleclick.net', '127.0.0.1')
    body = body.replace('www.youtube.com', 'your-reverse-proxy-ip')
    return [body]

def handle_css(upstream_url, environ, start_response):
    response = urllib2.urlopen(urllib2.Request(upstream_url))
    headers = []
    for k, v in response.headers.items():
        if 'set-cookie' == k.lower():
            v = v.replace('domain=.youtube.com;', '')
        headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    body = RE_YTIMG.sub(replace_ytimg, body)
    return [body]

def handle_watch(video_id, environ, start_response):
    video_url = REDIS.get(video_id)
    if video_url:
        LOGGER.info('%s hit cache' % video_id)
    else:
        LOGGER.info('get url for movie: %s' % video_id)
        try:
            # resolve the direct stream url with youtube-dl in the subprocess pool
            video_url = proc_pool.spawn(get_url, video_id).get()
        except:
            LOGGER.exception('failed to get url')
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        if 'googlevideo.com' not in video_url:
            LOGGER.error('googlevideo.com not in url: %s' % video_url)
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        video_url = video_url.replace('https://', 'http://')
        history = set()
        success = False
        for i in range(3):
            # try up to three workers until one can reach the stream
            worker = pick_worker(history)
            try_url = RE_GOOGLEVIDEO.sub(functools.partial(replace_googlevideo, worker), video_url)
            if is_url_correct(try_url):
                video_url = try_url
                success = True
                break
            # else:
            #     worker[1] = False
        if not success:
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        REDIS.set(video_id, video_url)
        REDIS.expire(video_id, 60 * 3)
    LOGGER.info('got url for movie: %s %s' % (video_id, video_url))
    start_response(httplib.FOUND, [
        ('Location', video_url),
        ('Content-Type', 'text/plain'),
        ('Cache-Control', 'max-age=180')
    ])
    return ['you can use this link to download the movie']

def get_url(video_id):
    if '/' in video_id:
        raise Exception('evil')  # crude sanity check on the video id
    return subprocess.check_output(
        'youtube-dl http://www.youtube.com/watch?v=%s -g'
        % video_id, shell=True).strip()

def serve_forever():
    try:
        server = WSGIServer((LISTEN_IP, LISTEN_PORT), handle_request)
        LOGGER.info('serving HTTP on %s:%s...' % (LISTEN_IP, LISTEN_PORT))
    except:
        LOGGER.exception('failed to start HTTP server on %s:%s' % (LISTEN_IP, LISTEN_PORT))
        os._exit(1)
    server.serve_forever()


def pick_worker(history=()):
    # pick a random live worker; give up once every server name has been tried
    if len(history) >= len(WORKERS):
        raise Exception('no worker')
    server_name = random.choice(WORKERS.keys())
    worker = random.choice(WORKERS[server_name])
    if not worker[1]:
        return pick_worker(set(list(history) + [server_name]))
    return worker

def is_url_correct(url):
    class NoRedirectHandler(urllib2.HTTPRedirectHandler):
        # treat redirects as final responses so the raw status code can be inspected
        def http_error_302(self, req, fp, code, msg, headers):
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
            infourl.status = code
            infourl.code = code
            return infourl
        http_error_300 = http_error_302
        http_error_301 = http_error_302
        http_error_303 = http_error_302
        http_error_307 = http_error_302

    try:
        opener = urllib2.build_opener(NoRedirectHandler())
        response = opener.open(url)
        response.close()
        if 200 == response.code:
            return True
        else:
            LOGGER.error('status code %s for url %s' % (response.code, url))
            return False
    except:
        LOGGER.exception('try url failed: %s' % url)
        return False

def refresh_workers():
    # periodically probe every worker and update its alive flag in place
    while True:
        for workers in WORKERS.values():
            for worker in workers:
                worker[1] = is_worker_alive(worker[0])
        LOGGER.info('%s refreshed workers' % datetime.datetime.now())
        gevent.sleep(60 * 60)


def is_worker_alive(worker_host):
    try:
        urllib2.urlopen('http://%s/image/i1/vi/tLcfAnN2QgY/mqdefault.jpg' % worker_host, timeout=3).close()
        LOGGER.info('%s => OK' % worker_host)
        return True
    except:
        LOGGER.info('%s => FAILURE' % worker_host)
        return False


signal.signal(signal.SIGINT, lambda signum, frame: os._exit(0))
logging.basicConfig(level=logging.DEBUG)
gevent.spawn(refresh_workers)
serve_forever()
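The script leaves deployment details to the reader. From the way the code uses them, WORKERS maps an arbitrary server name to a list of mutable [host, alive] pairs (refresh_workers() flips the flag in place, pick_worker() reads it), redis must be reachable on localhost, youtube-dl must be on the PATH, and the 'your-reverse-proxy-ip' placeholders in the HTML rewriting need to be replaced with the front end's own address. Something like the following, with made-up addresses, would be a starting point:

    # Hypothetical configuration -- substitute your own worker hosts and bind address.
    WORKERS = {
        'worker-1': [['203.0.113.10', True]],
        'worker-2': [['203.0.113.11', True], ['203.0.113.12', True]],
    }
    LISTEN_IP = '0.0.0.0'   # bind on all interfaces
    LISTEN_PORT = 3000

Each worker host is expected to serve the nginx configuration above, which is why is_worker_alive() can probe http://<worker>/image/i1/vi/tLcfAnN2QgY/mqdefault.jpg as a health check.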