Last active
August 29, 2015 14:27
-
-
Save Guest007/d21d33d5c46e3b66a1e2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf8 | |
import sys | |
import SocketServer | |
import SimpleHTTPServer | |
import requests | |
from bs4 import BeautifulSoup | |
import string | |
class Proxy(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Serve pages fetched from ``target`` with a trademark sign added to words.

    Strictly speaking this is not a proxy but a plain HTTP server: every GET
    is answered by requesting the same path from ``target`` and rewriting
    the returned HTML before sending it to the client.
    """

    # Site whose pages are fetched and rewritten.
    target = 'http://habrahabr.ru'
    # Characters stripped from both ends of a token before it is measured.
    p = string.punctuation + u'«»'
    # Seconds to wait for the upstream site; without a timeout a stalled
    # upstream connection would block the (single-threaded) server forever.
    upstream_timeout = 10

    def _absolutize(self, url):
        """Return *url* prefixed with ``target`` if it is root-relative.

        Only URLs of the form ``/path`` are rewritten.  Absolute URLs,
        protocol-relative ``//host/...`` URLs, fragments, ``mailto:`` links
        and path-relative URLs are returned unchanged, because gluing
        ``target`` in front of them would produce a broken address.
        """
        if url.startswith('/') and not url.startswith('//'):
            return self.target + url
        return url

    def mangle_html(self, html):
        """Rewrite an HTML document and return it as UTF-8 encoded bytes.

        Root-relative ``href``/``src`` values of ``<link>``, ``<img>`` and
        ``<a>`` elements are pointed at ``target`` so that styles, images
        and internal links keep loading, and every visible text node is
        passed through :meth:`mangle`.
        """
        # Local import keeps this block self-contained; bs4 is already a
        # dependency of the file.
        from bs4 import Comment

        # Parents whose text is not page copy and must stay untouched.
        skip = ('style', 'script', '[document]', 'head', 'title')
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')

        # <script src> paths are not rewritten (could be added the same way).
        for tag_name, attr in (('link', 'href'), ('img', 'src'), ('a', 'href')):
            # attr=True selects only elements that actually carry the
            # attribute, so tags without it are skipped instead of raising.
            for element in soup.find_all(tag_name, **{attr: True}):
                element[attr] = self._absolutize(element[attr])

        for node in soup.find_all(text=True):
            # Replacing a comment with a plain string would turn it into
            # visible page text, so comments are left alone.
            if isinstance(node, Comment) or node.parent.name in skip:
                continue
            stripped = node.strip()
            if stripped:
                node.replace_with(self.mangle(stripped))
        return soup.encode('utf-8')

    def is_tm_word(self, s):
        """Return True if *s*, stripped of surrounding punctuation, has 6 characters."""
        return len(s.strip(self.p)) == 6

    def mangle(self, s):
        """Append a trademark sign to every six-character word in *s*.

        The text is split on whitespace and re-joined with single spaces, so
        whitespace runs are collapsed.  The result starts with a space
        unless the first token begins with ``.``, ``,``, ``;`` or ``:``,
        so that punctuation following an inline tag stays attached to it.
        An empty *s* yields a single space.
        """
        words = []
        for x in s.split():
            if self.is_tm_word(x):
                # Mark the bare word only, keeping surrounding punctuation.
                word = x.strip(self.p)
                x = x.replace(word, word + u'™')
            words.append(x)
        text = u' '.join(words)
        if words and words[0].startswith(('.', ',', ';', ':')):
            return text
        return u' ' + text

    def do_GET(self):
        """Answer a GET by fetching the same path from ``target``.

        HTML responses are rewritten with :meth:`mangle_html` and announced
        as UTF-8, since that is the encoding the rewritten document is
        serialized in.  Any other content type is passed through
        byte-for-byte.  If the upstream request fails, a 502 is returned.
        """
        try:
            r = requests.get(self.target + self.path,
                             timeout=self.upstream_timeout)
        except requests.RequestException:
            self.send_error(502, 'Upstream request failed')
            return

        content_type = r.headers.get('content-type',
                                     'application/octet-stream')
        if 'text/html' in content_type.lower():
            body = self.mangle_html(r.text)
            content_type = 'text/html; charset=utf-8'
        else:
            # Images, styles etc. must not go through the HTML parser.
            body = r.content

        self.send_response(r.status_code)
        self.send_header('content-type', content_type)
        self.end_headers()
        self.wfile.write(body)
def main():
    """Start the rewriting HTTP server and serve until interrupted.

    The listening port is taken from the first command-line argument and
    defaults to 8000.  A non-numeric argument raises ``ValueError``.
    """
    port = int(sys.argv[1]) if sys.argv[1:] else 8000
    httpd = SocketServer.TCPServer(('', port), Proxy)
    # Parenthesised single-argument form prints the same text under both
    # the print statement and the print function.
    print("Habraproxing at port %d" % port)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server; exit quietly.
        pass
    finally:
        # Release the listening socket so the port can be reused at once.
        httpd.server_close()
# Start the server only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment