Created
February 7, 2019 08:35
-
-
Save radxene/bcdca69952e57b8f035fdcba79b552c4 to your computer and use it in GitHub Desktop.
Habraproxy - local HTTP proxy server (Хабрапрокси)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import urlparse, urljoin

import lxml.etree as etree
import requests
class ManipStr(object):
    """Stateless string helpers used while rewriting proxied Habr pages."""

    # Hosts whose absolute links are turned into site-relative links so that
    # navigation stays inside the proxy.
    HABRA_HOSTS = ('habrahabr.ru', 'habr.com')
    # Sign appended to marked words and the word length that triggers it.
    TM_CHAR = '™'
    TM_WORD_LEN = 6

    @staticmethod
    def rm_host_habra(url):
        """Return *url* without scheme and host when it points at Habr.

        The path, path parameters, query string and fragment are all kept
        (dropping the query would break search and pagination links). A
        Habr URL with an empty path becomes ``'/'`` because an empty
        ``href`` would refer to the current page instead of the site root.
        URLs of any other host, and relative URLs, are returned unchanged.
        """
        pr = urlparse(url)
        if pr.netloc not in ManipStr.HABRA_HOSTS:
            return url
        relative = pr.path or '/'
        if pr.params:
            relative += ';' + pr.params
        if pr.query:
            relative += '?' + pr.query
        if pr.fragment:
            relative += '#' + pr.fragment
        return relative

    @staticmethod
    def mark_tm(data):
        """Append the trademark sign to every six-character word in *data*.

        A word is a maximal run of alphanumeric characters (``str.isalnum``),
        so words are recognised next to any punctuation or whitespace, not
        only between single spaces. Every other character, including
        newlines, tabs and non-breaking spaces, is copied through untouched.
        A word that is already followed by the sign is not marked again.
        """
        tm_char = ManipStr.TM_CHAR
        word_len = ManipStr.TM_WORD_LEN
        pieces = []
        run = 0  # length of the alphanumeric run that ends just before ch
        for ch in data:
            if ch.isalnum():
                run += 1
            else:
                if run == word_len and ch != tm_char:
                    pieces.append(tm_char)
                run = 0
            pieces.append(ch)
        # A word that ends exactly at the end of the text has no delimiter
        # after it to trigger the check inside the loop.
        if run == word_len:
            pieces.append(tm_char)
        return ''.join(pieces)
class CollectorTarget(object):
    """Parser target that re-serialises the HTML event stream as text.

    It implements the ``start`` / ``end`` / ``data`` / ``comment`` /
    ``close`` callbacks expected of an ``etree.HTMLParser`` target. While
    serialising it rewrites Habr links in ``<a href>`` to relative ones and
    passes text through ``ManipStr.mark_tm``.
    """

    # Elements that have no closing tag in HTML; writing ``</br>`` would be
    # rendered by browsers as a second line break.
    VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))
    # Elements whose text content is not written to the output.
    SKIPPED_TEXT_TAGS = frozenset(('script', 'style'))

    def __init__(self):
        # Output is collected as a list of chunks and joined once; repeated
        # ``+=`` on a string attribute copies the whole page on every event.
        self._chunks = ['<!DOCTYPE html>']
        # Stack of currently open tags, so text that follows a closed child
        # is attributed to the parent rather than to the child.
        self._open_tags = []
        self.cur_elem = ''

    @property
    def parsed(self):
        """The HTML collected so far, as a single string."""
        return ''.join(self._chunks)

    @parsed.setter
    def parsed(self, value):
        self._chunks = [value]

    def start(self, tag, attrib):
        """Write the opening tag *tag* with the attributes in *attrib*."""
        self._open_tags.append(tag)
        self.cur_elem = tag
        chunks = self._chunks
        chunks.append('<' + tag)
        for key, value in (attrib or {}).items():
            if tag == 'a' and key == 'href':
                value = ManipStr.rm_host_habra(value)
            # The parser hands over decoded values, so quotes and ampersands
            # must be re-escaped to keep the attribute well-formed.
            chunks.append(' {}="{}"'.format(key, html.escape(value, quote=True)))
        chunks.append('>')

    def end(self, tag):
        """Close *tag*: self-close void elements, else write ``</tag>``."""
        if self._open_tags:
            self._open_tags.pop()
        self.cur_elem = self._open_tags[-1] if self._open_tags else ''
        if tag in self.VOID_TAGS:
            # Turn the just-written '<tag ...>' into '<tag .../>'.
            if self._chunks and self._chunks[-1] == '>':
                self._chunks[-1] = '/>'
        else:
            self._chunks.append('</{}>'.format(tag))

    def data(self, data):
        """Write text content, marked by ``ManipStr.mark_tm`` and escaped."""
        if self.cur_elem in self.SKIPPED_TEXT_TAGS:
            # NOTE(review): inline script/style bodies are dropped, exactly
            # as before this change. Confirm whether that is intentional or
            # whether they should be emitted verbatim (unmarked).
            return
        # Escape after marking: the parser decoded entities such as '&lt;',
        # and writing them back raw would turn quoted markup into real tags.
        self._chunks.append(html.escape(ManipStr.mark_tm(data), quote=False))

    def comment(self, comment):
        """Write an HTML comment unchanged."""
        self._chunks.append('<!--{}-->'.format(comment))

    def close(self):
        """Return the complete serialised document as a string."""
        return ''.join(self._chunks)
class ProxyHandler(BaseHTTPRequestHandler):
    """HTTP handler that serves Habr pages rewritten by CollectorTarget."""

    OLD_HOST = 'http://habrahabr.ru'
    NEW_HOST = 'https://habr.com'
    # Seconds to wait for the upstream site; without a timeout a stalled
    # upstream would block the (single-threaded) server indefinitely.
    UPSTREAM_TIMEOUT = 10

    def _set_headers(self, status=200,
                     content_type='text/html; charset=utf-8', length=None):
        """Send the status line and response headers.

        Called without arguments it announces a 200 HTML response, as
        before; the charset is stated because bodies are UTF-8 encoded.
        """
        self.send_response(status)
        self.send_header('Content-type', content_type)
        if length is not None:
            self.send_header('Content-Length', str(length))
        self.end_headers()

    @classmethod
    def _upstream_url(cls, path):
        """Map a request path to the upstream URL to fetch.

        The path is appended to a fixed host instead of being passed to
        ``urljoin``: ``urljoin`` would let a request target such as
        ``//other.host/x`` or ``http://other.host/x`` replace the host and
        make the proxy fetch arbitrary sites.
        """
        path = '/' + path.lstrip('/')
        if path == '/':
            return cls.NEW_HOST + '/ru/top'
        if path.startswith('/ru/'):
            return cls.NEW_HOST + path
        return cls.OLD_HOST + path

    def _reply(self, status, content_type, body):
        """Send a complete response, ignoring a client that hung up."""
        try:
            self._set_headers(status, content_type, len(body))
            self.wfile.write(body)
        except (BrokenPipeError, ConnectionResetError):
            pass

    def _reply_error(self, status, message, explain=None):
        """Send an error page, ignoring a client that hung up."""
        try:
            self.send_error(status, message, explain)
        except (BrokenPipeError, ConnectionResetError):
            pass

    def do_GET(self):
        """Fetch the requested page upstream and return it rewritten."""
        url = self._upstream_url(self.path)
        try:
            res = requests.get(url, timeout=self.UPSTREAM_TIMEOUT)
        except requests.RequestException as exc:
            self._reply_error(502, 'Upstream request failed', str(exc))
            return
        if res.status_code != requests.codes.ok:
            # Always answer: staying silent leaves the client with an empty
            # reply and no indication of what went wrong.
            self._reply_error(res.status_code, 'Upstream returned an error')
            return
        upstream_type = res.headers.get('Content-Type', '')
        if upstream_type and 'html' not in upstream_type.lower():
            # Images, styles, scripts etc. must not go through the HTML
            # rewriter; forward them byte-for-byte.
            self._reply(200, upstream_type, res.content)
            return
        parser = etree.HTMLParser(target=CollectorTarget())
        content = etree.HTML(res.text, parser).encode('utf-8')
        self._reply(200, 'text/html; charset=utf-8', content)
if __name__ == '__main__':
    # Entry point: ``python <script> [port]`` starts the proxy on localhost.
    import sys

    hostname = 'localhost'
    port = 8232
    # As before, the port is taken from the command line only when exactly
    # one argument is given; otherwise the default is used.
    if len(sys.argv) == 2:
        try:
            port = int(sys.argv[1])
        except ValueError:
            sys.exit('Port must be an integer, got {!r}'.format(sys.argv[1]))

    httpd = HTTPServer((hostname, port), ProxyHandler)
    print('Server Starts - http://{}:{}'.format(hostname, port))
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the server.
        pass
    finally:
        # Release the listening socket on any exit path, not only Ctrl+C.
        httpd.server_close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment