Last active
January 6, 2017 09:03
-
-
Save adilkhash/050d7ba461f29c6977d0a1f9a67297ac to your computer and use it in GitHub Desktop.
Ivelum test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
import re | |
import string | |
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer | |
import requests | |
from bs4 import BeautifulSoup | |
def get_response(url):
    """Fetch *url* with an HTTP GET and return the ``requests`` response.

    The request is bounded by a ``(10, 20)`` timeout pair, which
    ``requests`` interprets as (connect timeout, read timeout) in seconds.
    """
    connect_timeout, read_timeout = 10, 20
    return requests.get(url, timeout=(connect_timeout, read_timeout))
def escape(s):
    """Return *s* with angle brackets replaced by HTML entities.

    ``<`` becomes ``&lt;`` and ``>`` becomes ``&gt;`` so that text written
    back into the page cannot open or close a tag.  Ampersands are left
    untouched, so entity references already present in *s* pass through
    unchanged.
    """
    return s.replace('<', '&lt;').replace('>', '&gt;')
def trademark_it(word):
    """Append a trademark sign to *word* if it is a six-letter word.

    *word* is a single whitespace-free token that may carry punctuation
    around the letters (``"(foobar),"``).  Letters are Latin or Cyrillic
    characters and the underscore; everything else is ignored when
    counting.  When exactly six letters remain and they form one
    contiguous run inside the token, ``™`` is inserted right after that
    run and the surrounding characters are kept in place.

    Returns the (possibly modified) token as a unicode string.  Tokens
    that do not qualify are returned unchanged.
    """
    # Strip everything that is not a letter; the regex also removes any
    # kind of whitespace, so no separate whitespace replacement is needed.
    letters = re.sub(u'[^а-яА-ЯёЁa-zA-Z_]+', u'', word, flags=re.IGNORECASE)
    if len(letters) != 6:
        return u'{0}'.format(word)

    start = word.find(letters)
    if start < 0:
        # The six letters are interleaved with other characters
        # (e.g. "ab-cdef"), so this is not a single six-letter word.
        # Slicing with the -1 from find() would corrupt the token.
        return u'{0}'.format(word)

    # Restore the characters cut off before and after the letters.
    end = start + len(letters)
    return u'{0}{1}™{2}'.format(word[:start], letters, word[end:])
def proccess_text(text):
    """Return *text* with every six-letter word marked and brackets escaped.

    The text is cut into tokens at each regular space and each no-break
    space.  Every token is passed through ``trademark_it`` and then
    ``escape``; the separators themselves are written back exactly as
    they were found, so no-break spaces and runs of several spaces
    survive unchanged.

    Splitting with a capturing group replaces the earlier approach of
    swapping spaces for a textual sentinel, which altered any input that
    happened to contain the sentinel and did not keep no-break spaces.
    """
    # With a capturing group re.split() returns the separators too:
    # even indexes hold tokens, odd indexes hold the single separator
    # character that followed the token.
    pieces = re.split(u'([ \u00a0])', text)
    return u''.join(
        piece if index % 2 else escape(trademark_it(piece))
        for index, piece in enumerate(pieces)
    )
def process_text_nodes(nodes):
    """Rewrite the text found directly under each element in *nodes*.

    For every element, each entry of its ``contents`` is inspected.  An
    entry whose ``.string`` is ``None`` is skipped; otherwise that string
    is replaced in the tree by the result of ``proccess_text``.
    """
    for node in nodes:
        for element in node.contents:
            if child_has_text(element):
                element.string.replace_with(
                    proccess_text(unicode(element.string))
                )


def child_has_text(element):
    """Tell whether *element* exposes a string that can be rewritten."""
    return element.string is not None
def replace_habr_urls(content):
    """Rewrite habrahabr.ru URLs in raw page markup for use behind the proxy.

    Two substitutions are applied, both case-insensitive:

    1. Every ``https://habrahabr.ru`` prefix is removed, turning absolute
       links into site-relative ones.
    2. ``link href="/...`` is pointed back at ``https://habrahabr.ru/``,
       so ``<link>`` resources still resolve against the real site.

    Protocol-relative references (``href="//host/..."``) are left as they
    are; prefixing them with the site origin would produce a broken URL.

    Returns the rewritten markup.
    """
    text = re.sub(r'https://habrahabr\.ru', '', content, flags=re.IGNORECASE)
    # (?!/) : a second slash means a protocol-relative URL, not a site path.
    text = re.sub(
        r'link href="/(?!/)',
        'link href="https://habrahabr.ru/',
        text,
        flags=re.IGNORECASE,
    )
    return text
def hijack_habr(url):
    """Download the page at *url* and return it as processed UTF-8 bytes.

    The raw markup first goes through ``replace_habr_urls`` and is then
    parsed with the ``html.parser`` backend.  The elements selected below
    are handed, one selector at a time and in this order, to
    ``process_text_nodes``.  The tree is finally encoded with
    ``formatter=None``.
    """
    page = get_response(url)
    soup = BeautifulSoup(replace_habr_urls(page.content), 'html.parser')

    # (tag name, CSS class) pairs whose text should be rewritten.
    selectors = (
        ('h1', 'post__title'),
        ('h2', 'post__title'),
        ('div', 'content'),
        ('span', 'tab-item__value'),
        ('div', 'buttons'),
    )
    for tag_name, css_class in selectors:
        process_text_nodes(soup.find_all(tag_name, class_=css_class))

    return soup.encode(encoding='utf-8', formatter=None)
def build_url(uri):
    """Return the absolute habrahabr.ru URL for the request path *uri*."""
    template = 'https://habrahabr.ru{0}'
    absolute = template.format(uri)
    return absolute
class HttpProcessor(BaseHTTPRequestHandler):
    """Request handler that proxies GET requests to habrahabr.ru.

    Paths under ``/fonts/`` are relayed as-is; every other path is fetched
    and rewritten by ``hijack_habr`` before being returned as HTML.
    """

    # Upstream headers describing how the upstream body was framed or
    # compressed on the wire.  The body relayed here is
    # ``response.content``, which ``requests`` has already decoded, so
    # forwarding these verbatim could contradict the bytes actually sent.
    _SKIPPED_HEADERS = frozenset(
        ('content-encoding', 'transfer-encoding', 'content-length',
         'connection')
    )

    def handle_fonts(self):
        """Relay a font file from the upstream site to the client.

        The upstream status code and headers are passed through, except
        for the framing headers listed in ``_SKIPPED_HEADERS``; a
        ``Content-Length`` matching the relayed body is sent instead.
        """
        # Use the shared helper so font requests get the same timeout as
        # page requests.
        response = get_response(build_url(self.path))
        body = response.content
        self.send_response(response.status_code)
        for name, value in response.headers.items():
            if name.lower() not in self._SKIPPED_HEADERS:
                self.send_header(name, value)
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Serve a GET request: fonts are relayed, pages are rewritten."""
        if self.path.startswith('/fonts/'):
            self.handle_fonts()
            return
        # Build the page before any header is written, so a failing
        # upstream fetch does not leave the client with a 200 status and
        # no body.
        body = hijack_habr(build_url(self.path))
        self.send_response(200)
        self.send_header('content-type', 'text/html')
        self.end_headers()
        self.wfile.write(body)
# Start the proxy: listen on localhost:8080, hand every request to
# HttpProcessor, and keep serving until the process is stopped.
serv = HTTPServer(
    ('localhost', 8080),
    HttpProcessor,
)
serv.serve_forever()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment