Skip to content

Instantly share code, notes, and snippets.

@Guest007
Last active August 29, 2015 14:27
Show Gist options
  • Save Guest007/d21d33d5c46e3b66a1e2 to your computer and use it in GitHub Desktop.
Save Guest007/d21d33d5c46e3b66a1e2 to your computer and use it in GitHub Desktop.
#coding=utf8
import sys
import SocketServer
import SimpleHTTPServer
import requests
from bs4 import BeautifulSoup
import string
class Proxy(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """HTTP handler that re-serves pages fetched from ``target``.

    Strictly speaking this is not a proxy but a small HTTP server: every GET
    is answered by fetching the same path from ``target``, appending a
    trademark sign to every six-character word of the visible text and
    sending the modified document back to the client.
    """

    # Site whose pages are fetched and re-served.
    target = 'http://habrahabr.ru'
    # Characters stripped from both ends of a token before measuring it.
    p = string.punctuation + u'«»'
    # Seconds to wait for the upstream site before giving up on a request.
    upstream_timeout = 10

    def mangle_html(self, html):
        """Rewrite resource URLs and visible text of an HTML document.

        ``href``/``src`` attributes of ``<link>``, ``<img>`` and ``<a>`` tags
        are made absolute against the upstream page, then every visible text
        node is passed through :meth:`mangle`.

        :param html: the document markup.
        :returns: the modified document serialized as UTF-8 bytes.
        """
        # Local imports keep this class self-contained; the file-level
        # imports only provide ``BeautifulSoup`` itself.
        from bs4 import Comment
        try:  # Python 2
            from urlparse import urljoin
        except ImportError:  # Python 3
            from urllib.parse import urljoin

        # Parents whose text is never shown to the reader as page content.
        skip = ('style', 'script', '[document]', 'head', 'title')
        # Naming the parser makes the result independent of whichever
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')

        # Resolve against the page being served so that relative paths and
        # fragment-only links end up on the right upstream document.
        base = self.target + getattr(self, 'path', '')
        # Script ``src`` paths are intentionally left untouched.
        for tag_name, attr in (('link', 'href'), ('img', 'src'), ('a', 'href')):
            for tag in soup.find_all(tag_name):
                value = tag.get(attr)
                if value:  # tags lacking the attribute are left alone
                    tag[attr] = urljoin(base, value)

        for node in soup.find_all(text=True):
            # Comments are text nodes too; replacing one with a plain string
            # would make its content visible on the page.
            if isinstance(node, Comment) or node.parent.name in skip:
                continue
            text = node.strip()
            if text:
                node.replace_with(self.mangle(text))
        return soup.encode('utf-8')

    def is_tm_word(self, s):
        """Return True if ``s`` is six characters long once the punctuation
        listed in ``p`` is stripped from both ends."""
        return len(s.strip(self.p)) == 6

    def mangle(self, s):
        """Append a trademark sign to every six-character word of ``s``.

        Words are whitespace-separated tokens; punctuation around a word is
        kept in place and the sign is inserted right after the word itself.

        :param s: text to process.
        :returns: the words joined by single spaces, prefixed with one space
            unless the text starts with one of ``.,;:``.
        """
        words = []
        for token in s.split():
            if self.is_tm_word(token):
                core = token.strip(self.p)  # the bare word, no punctuation
                token = token.replace(core, core + u'™', 1)
            words.append(token)
        joined = u' '.join(words)
        # Text that opens with punctuation continues the preceding element,
        # so it must not be separated from it by a space.
        if words and words[0][0] in u'.,;:':
            return joined
        return u' ' + joined

    def do_GET(self):
        """Serve a GET request with the (possibly modified) upstream page.

        HTML responses are rewritten by :meth:`mangle_html`; any other
        content is passed through byte-for-byte. A failed upstream request
        is reported to the client as ``502 Bad Gateway``.
        """
        try:
            r = requests.get(self.target + self.path,
                             timeout=self.upstream_timeout)
        except requests.RequestException as exc:
            self.log_error('Upstream request failed: %s', exc)
            self.send_error(502, 'Upstream request failed')
            return

        content_type = r.headers.get('content-type', '')
        if 'text/html' in content_type.lower():
            body = self.mangle_html(r.text)
            # mangle_html always emits UTF-8, whatever upstream declared.
            content_type = 'text/html; charset=utf-8'
        else:
            # Images, styles, etc. must not go through the HTML parser.
            body = r.content

        self.send_response(r.status_code)
        if content_type:
            self.send_header('Content-Type', content_type)
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)
def main():
    """Run the server until interrupted.

    The listening port is taken from the first command-line argument and
    defaults to 8000. The listening socket is always closed on exit, and
    Ctrl-C stops the server without a traceback.
    """
    port = int(sys.argv[1]) if sys.argv[1:] else 8000
    httpd = SocketServer.TCPServer(('', port), Proxy)
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("Habraproxing at port %d" % port)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass  # normal way to stop the server from a terminal
    finally:
        httpd.server_close()
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment