Skip to content

Instantly share code, notes, and snippets.

@asasuou
Forked from fqrouter/worker nginx conf
Created October 1, 2015 17:35
Show Gist options
  • Save asasuou/47d856b785236284817a to your computer and use it in GitHub Desktop.
Youtube Reverse Proxy
# Use Google DNS to resolve upstream hostnames at request time; required
# because proxy_pass below targets a variable ($upstream_host).
resolver 8.8.8.8;
# /video/<sub>/<path>  ->  https://<sub>.googlevideo.com/<path>
location /video/ {
    if ($request_uri ~ "^/video/(.+?)/.+") {
        # The first path component names the googlevideo host to proxy to.
        set $upstream_host $1.googlevideo.com;
        # Suggest saving the stream as a file instead of inline playback.
        add_header Content-Disposition "attachment; filename=video.mp4;";
    }
    # Strip the /video/<sub> prefix before forwarding upstream.
    rewrite /video/.+?/(.+)$ /$1 break;
    # Stream straight through; don't spool large video responses.
    proxy_buffering off;
    proxy_pass https://$upstream_host;
    proxy_set_header Host $upstream_host;
}
# /image/<sub>/<path>  ->  http://<sub>.ytimg.com/<path>  (thumbnails etc.)
location /image/ {
    if ($request_uri ~ "^/image/(.+?)/.+") {
        # The first path component names the ytimg host to proxy to.
        set $upstream_host $1.ytimg.com;
    }
    # Strip the /image/<sub> prefix before forwarding upstream.
    rewrite /image/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass http://$upstream_host;
    proxy_set_header Host $upstream_host;
}
# /photo/<sub>/<path>  ->  http://<sub>.ggpht.com/<path>  (avatars/photos)
location /photo/ {
    if ($request_uri ~ "^/photo/(.+?)/.+") {
        # The first path component names the ggpht host to proxy to.
        set $upstream_host $1.ggpht.com;
    }
    # Strip the /photo/<sub> prefix before forwarding upstream.
    rewrite /photo/.+?/(.+)$ /$1 break;
    proxy_buffering off;
    proxy_pass http://$upstream_host;
    proxy_set_header Host $upstream_host;
}
#!/usr/bin/env python
import logging
import httplib
import os
import subprocess
import socket
import datetime
import random
import signal
import urllib2
import urlparse
import urllib
import re
from gevent.wsgi import WSGIServer
import gevent.monkey
import gevent.pool
import gevent
import redis
import cgi
import functools
import Cookie
# Redis client used to cache resolved video URLs (see handle_watch).
REDIS = redis.StrictRedis()
# NOTE(review): gevent recommends monkey-patching before other imports and
# objects are created; here it runs after the redis client is built --
# confirm the client still cooperates with the event loop.
gevent.monkey.patch_all(subprocess=True)
# Bounds how many youtube-dl subprocesses may run concurrently.
proc_pool = gevent.pool.Pool(size=16)
LOGGER = logging.getLogger(__name__)
# Map of server name -> list of [host, alive_flag] worker entries;
# alive_flag is refreshed by refresh_workers and read by pick_worker.
WORKERS = {
} # you need to fill this
LISTEN_IP = ''
LISTEN_PORT = 3000
# Patterns used to rewrite YouTube asset hosts in proxied response bodies.
RE_YTIMG_CSS = re.compile(r'/s.ytimg\.com(.*?\.css)', re.IGNORECASE)
# Backslash-escaped (JSON-embedded) form of the ytimg host reference.
RE_YTIMG_ESC = re.compile(r'\\/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
RE_YTIMG = re.compile(r'/([a-zA-Z0-9-]+?)\.ytimg\.com', re.IGNORECASE)
RE_GGPHT = re.compile(r'https://([a-zA-Z0-9-]+?)\.ggpht\.com', re.IGNORECASE)
RE_GOOGLEVIDEO = re.compile(r'/([a-zA-Z0-9-]+?)\.googlevideo\.com', re.IGNORECASE)
def handle_request(environ, start_response):
    """WSGI entry point: delegate to handle() and stream its response.

    Any unexpected failure is logged and turned into a plain 500 response
    so one bad request cannot kill the server.
    """
    method = environ.get('REQUEST_METHOD')
    try:
        lines = handle(environ, lambda status, headers: start_response(get_http_response(status), headers))
    except Exception:
        # Was a bare ``except:``, which would also trap SystemExit and
        # KeyboardInterrupt and prevent clean shutdown.
        path = environ.get('PATH_INFO', '').strip('/')
        LOGGER.exception('failed to handle request: %s %s', method, path)
        start_response('500 INTERNAL_SERVER_ERROR', [
            ('Content-Type', 'text/javascript'),
            ('Cache-Control', 'no-cache, no-store, must-revalidate'),
            ('Pragma', 'no-cache'),
            ('Expires', '0')])
        lines = ['Retry in 30 minutes']
    for line in lines:
        yield line
def get_http_response(code):
    """Turn a numeric status code into a WSGI status line like '200 OK'.

    WSGI requires the status to be a string; the original returned unknown
    codes unchanged (possibly an int), so coerce them to str here.
    """
    if code not in httplib.responses:
        return '%s' % code
    return '%s %s' % (code, httplib.responses[code])
def replace_ytimg_css(match):
    """Rewrite an s.ytimg.com stylesheet reference to this proxy's /css/ path."""
    css_path = match.group(1)
    return '/your-reverse-proxy-ip/css/' + css_path
def replace_ytimg_esc(match):
    """Rewrite a JSON-escaped *.ytimg.com host to a worker /image/ path."""
    worker_host = pick_worker()[0]
    return '\\/' + worker_host + '\\/image/' + match.group(1)
def replace_ytimg(match):
    """Rewrite a *.ytimg.com host reference to a worker /image/ path."""
    worker_host = pick_worker()[0]
    return '/' + worker_host + '/image/' + match.group(1)
def replace_ggpht(match):
    """Rewrite an https *.ggpht.com URL to a plain-http worker /photo/ URL."""
    worker_host = pick_worker()[0]
    return 'http://' + worker_host + '/photo/' + match.group(1)
def replace_googlevideo(worker, match):
    """Rewrite a *.googlevideo.com host to route through *worker* ([host, alive])."""
    return '/' + worker[0] + '/video/' + match.group(1)
def handle(environ, start_response):
    """Dispatch one proxied request.

    /watch and /watch_videos go to the video-URL resolver, /css/ and /t/
    to special handlers; everything else is fetched from youtube.com and
    the response body rewritten so asset hosts point back at the workers.
    """
    host = 'youtube.com'
    path = environ.get('PATH_INFO', '')
    if '/watch' == path:
        # /watch?v=<id>: resolve a direct media URL and redirect to it.
        video_id = urlparse.parse_qs(environ['QUERY_STRING'])['v'][0]
        return handle_watch(video_id, environ, start_response)
    if '/watch_videos' == path:
        # Playlist-style URL: only the first video id is used.
        video_id = urlparse.parse_qs(environ['QUERY_STRING'])['video_ids'][0].split(',')[0]
        return handle_watch(video_id, environ, start_response)
    if path.startswith('/css/'):
        # Stylesheets come from s.ytimg.com and get their ytimg refs rewritten.
        upstream_url = path.replace('/css/', '')
        upstream_url = 'http://s.ytimg.com/%s' % upstream_url
        return handle_css(upstream_url, environ, start_response)
    if path.startswith('/t/'):
        # /t/<domain>.js serves a tiny script that redirects to <domain>.
        domain = path.replace('/t/', '').replace('.js', '')
        words = 'window.location.href="http://%s";' % domain
        start_response(httplib.OK, [
            ('Content-Type', 'text/javascript'),
            ('Cache-Control', 'no-cache, no-store, must-revalidate'),
            ('Pragma', 'no-cache'),
            ('Expires', '0')])
        return [words]
    if path.startswith('//'):
        # Protocol-relative path: bounce the client to the absolute http URL.
        start_response(httplib.FOUND, [
            ('Location', 'http:%s' % path)
        ])
        return []
    data = None
    if 'POST' == environ['REQUEST_METHOD']:
        if '/results' == path:
            # Search form POST: re-encode the query as a GET against /results.
            post_body = cgi.FieldStorage(
                fp=environ['wsgi.input'],
                environ=environ,
                keep_blank_values=True)
            upstream_url = 'http://youtube.com/results?%s' % urllib.urlencode({'search_query': post_body['search_query'].value})
        else:
            # NOTE(review): only the first line of the POST body is forwarded
            # upstream -- confirm that is enough for the endpoints proxied here.
            data = environ['wsgi.input'].readline()
            upstream_url = 'http://%s%s' % (host, path)
    else:
        upstream_url = 'http://%s%s' % (host, path)
    if environ['QUERY_STRING']:
        upstream_url = '%s?%s' % (upstream_url, environ['QUERY_STRING'])
    LOGGER.info('upstream url: %s' % upstream_url)
    headers = {}
    if environ.get('HTTP_COOKIE'):
        # Forward the client's cookies upstream unchanged.
        LOGGER.info('cookie is: %s' % environ.get('HTTP_COOKIE'))
        headers['Cookie'] = environ.get('HTTP_COOKIE')
    try:
        response = urllib2.urlopen(urllib2.Request(upstream_url, data=data, headers=headers))
    except urllib2.HTTPError as e:
        # Pass upstream HTTP errors through to the client verbatim.
        start_response(e.code, [(k, v) for k, v in e.hdrs.items()])
        return [e.msg]
    except:
        # No-op re-raise: non-HTTP errors propagate to handle_request's 500 path.
        raise
    headers = []
    for k, v in response.headers.items():
        if 'set-cookie' == k.lower():
            # Drop the domain restriction so cookies stick to this proxy host.
            v = v.replace('domain=.youtube.com;', '')
        if 'x-frame' in k.lower():
            # Remove X-Frame-* headers so proxied pages can be framed.
            continue
        headers.append((k, v))
    start_response(httplib.OK, headers)
    body = response.read()
    # Rewrite static-asset host references to route through the worker pool.
    body = RE_YTIMG_CSS.sub(replace_ytimg_css, body)
    body = RE_YTIMG_ESC.sub(replace_ytimg_esc, body)
    body = RE_YTIMG.sub(replace_ytimg, body)
    body = RE_GGPHT.sub(replace_ggpht, body)
    # body = body.replace('class="search-form', 'method="POST" class="search-form')
    # Hide page chrome (masthead, banners, ads) by overriding its markup.
    body = body.replace('class="video-masthead">', 'style="display: none;">')
    body = body.replace('class="branded-page-v2-top-row">', 'style="display: none;">')
    body = body.replace('style="z-index: 1">', 'style="display: none;">')
    body = body.replace('style="z-index: 1;">', 'style="display: none;">')
    body = body.replace('class="premium-yva-unexpanded"', 'style="display: none;"')
    # body = body.replace('id="masthead-search"', 'style="position: relative; padding: 0; margin-top: 3px; overflow: hidden;"')
    # Black-hole the ad host and point absolute links back at this proxy.
    body = body.replace('ad.doubleclick.net', '127.0.0.1')
    body = body.replace('www.youtube.com', 'your-reverse-proxy-ip')
    return [body]
def handle_css(upstream_url, environ, start_response):
    """Proxy a stylesheet from s.ytimg.com, rewriting embedded ytimg hosts."""
    upstream = urllib2.urlopen(urllib2.Request(upstream_url))
    out_headers = []
    for name, value in upstream.headers.items():
        if name.lower() == 'set-cookie':
            # Drop the domain restriction so cookies stick to this proxy host.
            value = value.replace('domain=.youtube.com;', '')
        out_headers.append((name, value))
    start_response(httplib.OK, out_headers)
    css = RE_YTIMG.sub(replace_ytimg, upstream.read())
    return [css]
def handle_watch(video_id, environ, start_response):
    """Resolve *video_id* to a worker-proxied media URL and redirect to it.

    The resolved URL is cached in redis for 3 minutes; on a cache miss
    youtube-dl is run (via the process pool) and up to 3 workers are
    probed until one serves the rewritten URL.
    """
    video_url = REDIS.get(video_id)
    if video_url:
        LOGGER.info('%s hit cache' % video_id)
    else:
        LOGGER.info('get url for movie: %s' % video_id)
        try:
            video_url = proc_pool.spawn(get_url, video_id).get()
        except:
            # NOTE(review): bare except also traps SystemExit/KeyboardInterrupt.
            LOGGER.exception('failed to get url')
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        if 'googlevideo.com' not in video_url:
            # youtube-dl returned something we cannot proxy.
            LOGGER.error('googlevideo.com not in url: %s' % video_url)
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        # Workers are contacted over plain http.
        video_url = video_url.replace('https://', 'http://')
        history = set()
        success = False
        for i in range(3):
            # NOTE(review): *history* is never updated between iterations, so a
            # retry may pick the same worker again; the commented-out
            # ``worker[1] = False`` below hints at the intended penalty.
            worker = pick_worker(history)
            try_url = RE_GOOGLEVIDEO.sub(functools.partial(replace_googlevideo, worker), video_url)
            if is_url_correct(try_url):
                video_url = try_url
                success = True
                break
            # else:
            # worker[1] = False
        if not success:
            start_response(httplib.BAD_GATEWAY, [('Content-Type', 'text/plain')])
            return ['no valid url']
        # Cache the worker-proxied URL for 3 minutes.
        REDIS.set(video_id, video_url)
        REDIS.expire(video_id, 60 * 3)
    LOGGER.info('got url for movie: %s %s' % (video_id, video_url))
    start_response(httplib.FOUND, [
        ('Location', video_url),
        ('Content-Type', 'text/plain'),
        ('Cache-Control', 'max-age=180')
    ])
    return ['you can use this link to download the movie']
def get_url(video_id):
    """Resolve a YouTube video id to a direct media URL via ``youtube-dl -g``.

    Raises Exception('evil') for ids containing anything outside the
    YouTube id alphabet.
    """
    # The original interpolated video_id into a shell command line
    # (shell=True) and only rejected '/', so ids containing shell
    # metacharacters (';', '`', '$', ...) were a command injection.
    # Whitelist the id and drop the shell entirely.
    if not re.match(r'^[A-Za-z0-9_-]+$', video_id):
        raise Exception('evil')
    return subprocess.check_output(
        ['youtube-dl', 'http://www.youtube.com/watch?v=%s' % video_id, '-g']).strip()
def serve_forever():
    """Start the gevent WSGI server on LISTEN_IP:LISTEN_PORT and block forever.

    Startup failure (e.g. port already bound) aborts the process.
    """
    try:
        server = WSGIServer((LISTEN_IP, LISTEN_PORT), handle_request)
        LOGGER.info('serving HTTP on port %s:%s...' % (LISTEN_IP, LISTEN_PORT))
    except Exception:
        # Was a bare ``except:``; only real startup errors should abort.
        LOGGER.exception('failed to start HTTP server on port %s:%s' % (LISTEN_IP, LISTEN_PORT))
        os._exit(1)
    server.serve_forever()
def pick_worker(history=()):
    """Pick a random live worker entry ([host, alive_flag]) from WORKERS.

    *history* holds server names already found dead; they are excluded
    from the random choice so recursion is bounded (the original could
    re-pick an already-tried dead server indefinitely).  Raises
    Exception('no worker') once every server has been tried.
    """
    tried = set(history)
    if len(tried) >= len(WORKERS):
        raise Exception('no worker')
    candidates = [name for name in WORKERS if name not in tried]
    server_name = random.choice(candidates)
    worker = random.choice(WORKERS[server_name])
    if not worker[1]:
        # Dead worker: remember this server and try another.
        return pick_worker(tried | set([server_name]))
    return worker
def is_url_correct(url):
    """Return True if *url* answers HTTP 200 without following redirects."""
    class NoRedirectHandler(urllib2.HTTPRedirectHandler):
        # Return the redirect response itself instead of following it, so
        # the status check below sees what the worker actually sent.
        def http_error_302(self, req, fp, code, msg, headers):
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
            infourl.status = code
            infourl.code = code
            return infourl
        http_error_300 = http_error_302
        http_error_301 = http_error_302
        http_error_303 = http_error_302
        http_error_307 = http_error_302
    try:
        opener = urllib2.build_opener(NoRedirectHandler())
        response = opener.open(url)
        response.close()
        if 200 == response.code:
            return True
        LOGGER.error('status code %s for url %s' % (response.code, url))
        return False
    except Exception:
        # Was a bare ``except:``, which would also swallow KeyboardInterrupt.
        LOGGER.exception('try url failed: %s' % url)
        return False
def refresh_workers():
    """Hourly background loop: refresh every worker's alive flag in WORKERS."""
    while True:
        for worker_list in WORKERS.values():
            for entry in worker_list:
                # entry is [host, alive_flag]; probe the host and store result.
                entry[1] = is_worker_alive(entry[0])
        LOGGER.info('%s refreshed workers' % datetime.datetime.now())
        gevent.sleep(60 * 60)
def is_worker_alive(worker_host):
    """Probe *worker_host* by fetching a known thumbnail through its /image/ route."""
    try:
        urllib2.urlopen('http://%s/image/i1/vi/tLcfAnN2QgY/mqdefault.jpg' % worker_host, timeout=3).close()
        LOGGER.info('%s => OK' % worker_host)
        return True
    except Exception:
        # Was a bare ``except:``, which would also trap KeyboardInterrupt
        # and report a dead worker instead of letting shutdown proceed.
        LOGGER.info('%s => FAILURE' % worker_host)
        return False
# Exit immediately on Ctrl-C instead of raising KeyboardInterrupt through
# the gevent hub.  (``fame`` is the handler's frame argument, misspelled.)
signal.signal(signal.SIGINT, lambda signum, fame: os._exit(0))
logging.basicConfig(level=logging.DEBUG)
# Background greenlet keeps the worker alive-flags fresh (hourly).
gevent.spawn(refresh_workers)
# Blocks forever serving HTTP.
serve_forever()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment