adilkhash · January 6, 2017 09:03
diff --git a/habra_pr0xy.py b/habra_pr0xy.py
 # -*- coding: utf8 -*-

 import re
 import string
 from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer

 import requests
 from bs4 import BeautifulSoup


 def get_response(url):
    """Perform an HTTP GET for *url* with ``requests`` and return the response.

    :param url: absolute URL to fetch.
    :return: the object returned by ``requests.get``.
    """
    # requests reads a 2-tuple timeout as (connect, read) seconds.
    timeouts = (10, 20)
    return requests.get(url, timeout=timeouts)


 def escape(s):
    """Return *s* with ``<`` and ``>`` replaced by their HTML entities.

    Only the two angle brackets are rewritten; every other character
    (``&`` included) is returned untouched.
    """
    for char, entity in (('<', '&lt;'), ('>', '&gt;')):
        s = s.replace(char, entity)
    return s


 def trademark_it(word):
    """Append a trademark sign to *word* when it is a six-letter word.

    *word* is a single space-delimited token and may carry punctuation or
    ``&nbsp;`` entities around the actual word.  Latin letters, Cyrillic
    letters and the underscore count as letters; everything else is ignored
    when counting.

    :param word: the token to inspect.
    :return: a unicode string: the token with the sign inserted right after
        its six letters, or the token unchanged.
    """
    # The pattern has no backslashes, so a plain u'' literal is equivalent to
    # a raw one and is accepted by both Python 2 and Python 3.
    cleaned = re.sub(u'[^а-яА-ЯёЁa-zA-Z_]+', '', word.replace('&nbsp;', ''), flags=re.IGNORECASE)
    if len(cleaned) != 6:
        return u'{0}'.format(word)

    start = word.find(cleaned)
    if start < 0:
        # The six letters are not contiguous in the token (e.g. "abc-def"),
        # so there is no single word to mark.  Slicing with -1 here would
        # duplicate and drop characters, so leave the token as it is.
        return u'{0}'.format(word)

    # Keep whatever preceded the word, then the word with the sign, then the
    # rest of the token.
    return word[:start] + u'{0}™{1}'.format(cleaned, word[start + 6:])


 def proccess_text(text):
    """Mark six-letter words in *text* and escape angle brackets.

    The text is split on regular spaces; every token is passed through
    ``trademark_it`` and then ``escape``; the tokens are joined back with
    single spaces, so runs of spaces keep their length.

    :param text: unicode text of one text node.
    :return: the processed unicode string.
    """
    # Turn non-breaking spaces into an entity so they are not lost on output.
    text = text.replace(u'\N{NO-BREAK SPACE}', '&nbsp;')
    # Split on the regular space itself.  Going through a textual sentinel
    # would also split on a literal occurrence of that sentinel in the text,
    # and tokens produced by this split never contain a space to strip.
    return u' '.join(escape(trademark_it(token)) for token in text.split(' '))


 def process_text_nodes(nodes):
    """Rewrite, in place, the text found directly under each node of *nodes*.

    Every entry of a node's ``contents`` is examined.  Entries whose
    ``string`` attribute is ``None`` are skipped; for the others that string
    is replaced in the tree by the result of ``proccess_text``.

    :param nodes: iterable of parsed elements exposing ``contents``.
    """
    for node in nodes:
        for item in node.contents:
            current = item.string
            if current is not None:
                current.replace_with(proccess_text(unicode(current)))


 def replace_habr_urls(content):
    """Rewrite habrahabr.ru URLs in *content* so that links go through the proxy.

    Absolute ``https://habrahabr.ru`` prefixes are removed, which makes the
    links relative to whatever host serves the page.  ``<link href="/...">``
    references are then pointed back at habrahabr.ru.

    :param content: HTML markup as a string.
    :return: the markup with the URLs rewritten.
    """
    text = re.sub(r'https://habrahabr\.ru', '', content, flags=re.IGNORECASE)
    # The (?!/) lookahead leaves protocol-relative references ("//host/...")
    # alone: prefixing them with the site origin would produce a broken URL.
    text = re.sub(r'link href="/(?!/)', 'link href="https://habrahabr.ru/', text, flags=re.IGNORECASE)
    return text


 def hijack_habr(url):
    """Fetch *url*, rewrite its links and process the text of selected elements.

    :param url: absolute URL of the page to fetch.
    :return: the modified document serialized as UTF-8 encoded bytes.
    """
    page = get_response(url)
    soup = BeautifulSoup(replace_habr_urls(page.content), 'html.parser')

    # (tag name, CSS class) pairs whose text is processed, in this order.
    targets = (
        ('h1', 'post__title'),
        ('h2', 'post__title'),
        ('div', 'content'),
        ('span', 'tab-item__value'),
        ('div', 'buttons'),
    )
    for tag_name, css_class in targets:
        process_text_nodes(soup.find_all(tag_name, class_=css_class))

    # formatter=None makes Beautiful Soup emit strings as they are, so the
    # entities produced by the text processing are not escaped a second time.
    return soup.encode(encoding='utf-8', formatter=None)


 def build_url(uri):
    """Return the habrahabr.ru URL for the request path *uri*.

    :param uri: path part of the request, appended to the site origin as is.
    """
    template = 'https://habrahabr.ru{0}'
    return template.format(uri)


 class HttpProcessor(BaseHTTPRequestHandler):
    """Request handler that serves habrahabr.ru content through the proxy.

    Paths under ``/fonts/`` are relayed from the site unchanged; every other
    path is fetched and passed through ``hijack_habr`` before being returned
    as HTML.
    """

    # Upstream headers that are not copied to the client: hop-by-hop headers,
    # plus the ones describing the upstream body's encoding and length.
    # requests hands back an already decoded body, so those two would no
    # longer match what is written.
    _SKIPPED_HEADERS = frozenset([
        'connection', 'keep-alive', 'proxy-authenticate',
        'proxy-authorization', 'te', 'trailer', 'transfer-encoding',
        'upgrade', 'content-encoding', 'content-length',
    ])

    def handle_fonts(self):
        """Relay a font file from the site, keeping its status and headers."""
        # Use the shared helper so this request gets the same timeouts as
        # page requests.
        response = get_response(build_url(self.path))
        body = response.content
        self.send_response(response.status_code)
        for name, value in response.headers.items():
            if name.lower() not in self._SKIPPED_HEADERS:
                self.send_header(name, value)
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Serve a GET request: fonts are relayed, pages are rewritten."""
        try:
            if self.path.startswith('/fonts/'):
                self.handle_fonts()
            else:
                # Build the page before sending anything, so an upstream
                # failure is not reported to the client as a 200 response.
                body = hijack_habr(build_url(self.path))
                self.send_response(200)
                self.send_header('content-type', 'text/html')
                self.end_headers()
                self.wfile.write(body)
        except requests.RequestException:
            # Both branches talk to the site before writing any output, so it
            # is still possible to answer with a proper error status.
            self.send_error(502, 'Upstream request failed')

 # Bind the proxy to localhost:8080 with HttpProcessor as the request handler
 # and keep serving requests; this runs as soon as the module is executed.
 serv = HTTPServer(('localhost', 8080), HttpProcessor)
 serv.serve_forever()
	# -*- coding: utf8 -*-

	import re
	import string
	from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer

	import requests
	from bs4 import BeautifulSoup


	def get_response(url):
	    """Perform an HTTP GET for *url* with ``requests`` and return the response.

	    :param url: absolute URL to fetch.
	    :return: the object returned by ``requests.get``.
	    """
	    # requests reads a 2-tuple timeout as (connect, read) seconds.
	    return requests.get(url, timeout=(10, 20))


	def escape(s):
	    """Return *s* with ``<`` and ``>`` replaced by their HTML entities.

	    Only the two angle brackets are rewritten; every other character
	    (``&`` included) is returned untouched.
	    """
	    # The replacements must be the entities: replacing a bracket with
	    # itself would make this function a no-op.
	    return s.replace('<', '&lt;').replace('>', '&gt;')


	def trademark_it(word):
	    """Append a trademark sign to *word* when it is a six-letter word.

	    *word* is a single space-delimited token and may carry punctuation or
	    ``&nbsp;`` entities around the actual word.  Latin letters, Cyrillic
	    letters and the underscore count as letters; everything else is
	    ignored when counting.

	    :param word: the token to inspect.
	    :return: a unicode string: the token with the sign inserted right
	        after its six letters, or the token unchanged.
	    """
	    # Drop the non-breaking-space entity first so its letters are not
	    # counted.  The pattern has no backslashes, so a plain u'' literal is
	    # equivalent to a raw one and parses on both Python 2 and Python 3.
	    cleaned = re.sub(u'[^а-яА-ЯёЁa-zA-Z_]+', '', word.replace('&nbsp;', ''), flags=re.IGNORECASE)
	    if len(cleaned) != 6:
	        return u'{0}'.format(word)

	    start = word.find(cleaned)
	    if start < 0:
	        # The six letters are not contiguous in the token (e.g. "abc-def"),
	        # so there is no single word to mark.  Slicing with -1 here would
	        # duplicate and drop characters, so leave the token as it is.
	        return u'{0}'.format(word)

	    # Keep whatever preceded the word, then the word with the sign, then
	    # the rest of the token.
	    return word[:start] + u'{0}™{1}'.format(cleaned, word[start + 6:])


	def proccess_text(text):
	    """Mark six-letter words in *text* and escape angle brackets.

	    The text is split on regular spaces; every token is passed through
	    ``trademark_it`` and then ``escape``; the tokens are joined back with
	    single spaces, so runs of spaces keep their length.

	    :param text: unicode text of one text node.
	    :return: the processed unicode string.
	    """
	    # Turn non-breaking spaces into an entity so they are not lost on
	    # output.  Replacing them with a plain space would merge them with
	    # the regular separators.
	    text = text.replace(u'\N{NO-BREAK SPACE}', '&nbsp;')
	    # Split on the regular space itself.  Going through a textual sentinel
	    # would also split on a literal occurrence of that sentinel in the
	    # text, and tokens produced by this split never contain a space.
	    return u' '.join(escape(trademark_it(token)) for token in text.split(' '))


	def process_text_nodes(nodes):
	    """Rewrite, in place, the text found directly under each node of *nodes*.

	    Every entry of a node's ``contents`` is examined.  Entries whose
	    ``string`` attribute is ``None`` are skipped; for the others that
	    string is replaced in the tree by the result of ``proccess_text``.

	    :param nodes: iterable of parsed elements exposing ``contents``.
	    """
	    for title in nodes:
	        for child in title.contents:
	            if child.string is None:
	                continue
	            child.string.replace_with(proccess_text(unicode(child.string)))


	def replace_habr_urls(content):
	    """Rewrite habrahabr.ru URLs in *content* so that links go through the proxy.

	    Absolute ``https://habrahabr.ru`` prefixes are removed, which makes
	    the links relative to whatever host serves the page.
	    ``<link href="/...">`` references are then pointed back at
	    habrahabr.ru.

	    :param content: HTML markup as a string.
	    :return: the markup with the URLs rewritten.
	    """
	    text = re.sub(r'https://habrahabr\.ru', '', content, flags=re.IGNORECASE)
	    # The (?!/) lookahead leaves protocol-relative references
	    # ("//host/...") alone: prefixing them with the site origin would
	    # produce a broken URL.
	    text = re.sub(r'link href="/(?!/)', 'link href="https://habrahabr.ru/', text, flags=re.IGNORECASE)
	    return text


	def hijack_habr(url):
	    """Fetch *url*, rewrite its links and process the text of selected elements.

	    :param url: absolute URL of the page to fetch.
	    :return: the modified document serialized as UTF-8 encoded bytes.
	    """
	    response = get_response(url)
	    html_content = replace_habr_urls(response.content)
	    soup = BeautifulSoup(html_content, 'html.parser')

	    # (tag name, CSS class) pairs whose text is processed, in this order.
	    targets = (
	        ('h1', 'post__title'),
	        ('h2', 'post__title'),
	        ('div', 'content'),
	        ('span', 'tab-item__value'),
	        ('div', 'buttons'),
	    )
	    for tag_name, css_class in targets:
	        process_text_nodes(soup.find_all(tag_name, class_=css_class))

	    # formatter=None makes Beautiful Soup emit strings as they are, so the
	    # entities produced by the text processing are not escaped again.
	    return soup.encode(encoding='utf-8', formatter=None)


	def build_url(uri):
	    """Return the habrahabr.ru URL for the request path *uri*.

	    :param uri: path part of the request, appended to the site origin as is.
	    """
	    return 'https://habrahabr.ru{0}'.format(uri)


	class HttpProcessor(BaseHTTPRequestHandler):
	    """Request handler that serves habrahabr.ru content through the proxy.

	    Paths under ``/fonts/`` are relayed from the site unchanged; every
	    other path is fetched and passed through ``hijack_habr`` before being
	    returned as HTML.
	    """

	    # Upstream headers that are not copied to the client: hop-by-hop
	    # headers, plus the ones describing the upstream body's encoding and
	    # length.  requests hands back an already decoded body, so those two
	    # would no longer match what is written.
	    _SKIPPED_HEADERS = frozenset([
	        'connection', 'keep-alive', 'proxy-authenticate',
	        'proxy-authorization', 'te', 'trailer', 'transfer-encoding',
	        'upgrade', 'content-encoding', 'content-length',
	    ])

	    def handle_fonts(self):
	        """Relay a font file from the site, keeping its status and headers."""
	        # Use the shared helper so this request gets the same timeouts as
	        # page requests.
	        response = get_response(build_url(self.path))
	        body = response.content
	        self.send_response(response.status_code)
	        for name, value in response.headers.items():
	            if name.lower() not in self._SKIPPED_HEADERS:
	                self.send_header(name, value)
	        self.send_header('Content-Length', str(len(body)))
	        self.end_headers()
	        self.wfile.write(body)

	    def do_GET(self):
	        """Serve a GET request: fonts are relayed, pages are rewritten."""
	        try:
	            if self.path.startswith('/fonts/'):
	                self.handle_fonts()
	            else:
	                # Build the page before sending anything, so an upstream
	                # failure is not reported to the client as a 200 response.
	                body = hijack_habr(build_url(self.path))
	                self.send_response(200)
	                self.send_header('content-type', 'text/html')
	                self.end_headers()
	                self.wfile.write(body)
	        except requests.RequestException:
	            # Both branches talk to the site before writing any output, so
	            # it is still possible to answer with a proper error status.
	            self.send_error(502, 'Upstream request failed')

	# Bind the proxy to localhost:8080 with HttpProcessor as the request handler
	# and keep serving requests; this runs as soon as the module is executed.
	serv = HTTPServer(('localhost', 8080), HttpProcessor)
	serv.serve_forever()