Skip to content

Instantly share code, notes, and snippets.

@ANtlord
Last active August 29, 2015 14:22
Show Gist options
  • Select an option

  • Save ANtlord/41f403ad8c25860aab5a to your computer and use it in GitHub Desktop.

Select an option

Save ANtlord/41f403ad8c25860aab5a to your computer and use it in GitHub Desktop.
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import SimpleHTTPServer
import SocketServer
import signal
import sys
import urllib2
import gzip
import zlib
import webbrowser
import os
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from StringIO import StringIO
# Default TCP port of the local proxy. main() may override it from the first
# command-line argument; interpret_data() reads it when rewriting page links.
port = 8000
def get_url_data(url):
    """Fetch the body served at ``url``, un-gzipping it when necessary.

    :param url: address to request
    :type url: str
    :return: response body; decompressed when the server reports
        ``Content-Encoding: gzip``, otherwise exactly as received
    :rtype: str
    """
    res = urllib2.urlopen(url)
    try:
        encoding = res.info().get('Content-Encoding')
        body = res.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so close
        # explicitly to release the socket even when read() raises.
        res.close()

    if encoding != 'gzip':
        return body

    gz_file = gzip.GzipFile(fileobj=StringIO(body))
    try:
        return gz_file.read()
    finally:
        gz_file.close()
def interpret_data(data):
    """Mark every sixth word of a page and point its links at the proxy.

    The parsed document is modified in three ways:

    * the trade mark sign is appended to every sixth non-blank word of the
      text placed directly inside any tag other than ``script``,
      ``noscript`` and ``style`` (one counter runs across the whole page);
    * root-relative ``src`` of ``script`` tags and ``href`` of ``link`` tags
      are made absolute against ``http://habrahabr.ru``;
    * ``href`` of ``a`` tags has ``http://habrahabr.ru/`` replaced with the
      address of the local proxy (module-level ``port``).

    :param data: HTML markup of the page
    :type data: unicode
    :return: the modified document tree
    :rtype: bs4.BeautifulSoup
    """
    soup = BeautifulSoup(data)
    counter = 1
    for item in soup.find_all(True):
        # Exact type match on purpose: bs4's Comment, CData, etc. subclass
        # NavigableString, and isinstance() would start marking them too.
        strings = [x for x in item.contents if type(x) is NavigableString]
        if item.name not in ('script', 'noscript', 'style') and strings:
            for string in strings:
                words = string.split(' ')
                # Index by position in ``words``; blank pieces produced by
                # leading or repeated spaces are kept but never counted, so
                # the marked word always lands back in its own slot.
                for index, word in enumerate(words):
                    if not word.strip():
                        continue
                    if counter % 6 == 0:
                        words[index] = word + u'™'
                    counter += 1
                string.replace_with(' '.join(words))
        elif item.name in ('script', 'link'):
            key = 'src' if item.name == 'script' else 'href'
            path = item.attrs.get(key)
            # Only root-relative paths need the host; protocol-relative
            # ``//host/...`` URLs are already absolute and must stay as is.
            if path and path.startswith('/') and not path.startswith('//'):
                item.attrs[key] = 'http://habrahabr.ru' + path

    # Keep navigation inside the proxy: send site links to the local server.
    for item in soup.find_all('a'):
        if 'href' in item.attrs:
            item.attrs['href'] = item.attrs['href'].replace(
                'http://habrahabr.ru/', 'http://localhost:%s/' % port)
    return soup
class MyHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Request handler that serves modified habrahabr.ru pages.

    Every GET path is fetched from ``http://habrahabr.ru``, passed through
    ``interpret_data`` and returned to the client labelled as ``text/html``.
    """

    def get_habradata(self):
        """Fetch and transform the upstream page for the requested path.

        :return: the modified document tree produced by ``interpret_data``
        """
        url = u'http://habrahabr.ru%s' % self.path
        data = get_url_data(url)
        soup = interpret_data(data)
        return soup

    def do_GET(self):
        """Answer a GET request with the transformed upstream page.

        The page is fetched before any header is written, so an upstream
        failure is reported to the client as an error status instead of
        following an already sent ``200 OK``.
        """
        try:
            res = self.get_habradata()
        except urllib2.HTTPError as err:
            # HTTPError subclasses URLError, so it has to be caught first;
            # relay the upstream status code to the client.
            self.send_error(err.code, err.msg)
            return
        except urllib2.URLError as err:
            self.send_error(502, 'Bad Gateway: %s' % err.reason)
            return

        self.send_response(200, 'OK')
        self.send_header('Content-type', 'text/html; charset=utf8')
        self.end_headers()
        # Encode explicitly rather than relying on the implicit str()
        # conversion performed when the tree object is written to the socket.
        self.wfile.write(res.encode('utf-8'))
def main():
    """Start the local proxy server and open it in a browser tab.

    An optional first command-line argument overrides the listening port.
    The server runs until interrupted with Ctrl+C.

    :raises ValueError: if the port argument is not an integer
    """
    # The override is stored in the module-level variable because
    # interpret_data() reads it when rewriting links on the page.
    global port
    if sys.argv[1:]:
        port = int(sys.argv[1])

    # Bind before opening the browser: a bind failure then aborts without
    # opening a dead tab, and the first request already finds a listener.
    httpd = SocketServer.TCPServer(('127.0.0.1', port), MyHandler)
    try:
        webbrowser.open_new_tab('http://localhost:%s' % port)
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop; exit without a traceback.
        pass
    finally:
        httpd.server_close()
# Start the proxy only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment