muravjov · October 17, 2016 12:27
diff --git a/proxy.py b/proxy.py
 #!/usr/bin/env python
 # coding: utf-8

 """
 Setup:
 $ pip install urllib3 lxml git+https://github.com/bpabel/html5charref.git
 """

 from __future__ import print_function
 import re

 # Python 2 has no HTML5 entity decoder (Python 3 has html.unescape), so html5charref is used instead.
 import html5charref


 def main():
    """Run a local HTTP proxy for habrahabr.ru that marks six-character words.

    Every GET request is forwarded to the upstream site. HTML responses are
    parsed, their text is passed through append_tm(), absolute links to the
    upstream domain are made site-relative (so navigation stays on the
    proxy), and the page is re-serialised as UTF-8. Responses of any other
    content type are relayed unchanged.

    The server is started through BaseHTTPServer.test().
    """
    import SimpleHTTPServer as shttps

    import urllib3
    # :TODO: http://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
    urllib3.disable_warnings()

    import lxml.html as l_html

    http = urllib3.PoolManager()

    domain = "habrahabr.ru"
    prefixes = ["http://", "https://"]
    # Absolute URL prefixes that point at the upstream site; computed once
    # instead of on every element.
    site_prefixes = tuple(prefix + domain for prefix in prefixes)
    # Elements whose text is code rather than prose: rewriting it would
    # corrupt scripts and style sheets (e.g. the CSS word "margin").
    code_tags = ("script", "style")

    class Handler(shttps.SimpleHTTPRequestHandler):
        def do_GET(self):
            """Fetch self.path from the upstream site and relay the result."""
            url = "http://" + domain + self.path
            resp = http.urlopen("GET", url, preload_content=False)

            status = resp.status
            ctype = resp.getheader("Content-Type", "text/plain")
            body = resp.read()

            # Media types are case-insensitive and may carry parameters
            # ("Text/HTML ; charset=..."), so normalise before comparing.
            mime = ctype.split(";")[0].strip().lower()
            if mime == "text/html":
                tree = l_html.fromstring(body, parser=l_html.html_parser)
                for elem in tree.iter():
                    if elem.tag not in code_tags:
                        elem.text = append_tm(elem.text)
                    # The tail is the text *after* the element's closing
                    # tag, so it is ordinary page text even for
                    # <script>/<style> elements.
                    elem.tail = append_tm(elem.tail)

                    # https://habrahabr.ru/company/plarium/blog/312318/ =>
                    # /company/plarium/blog/312318/
                    href = elem.attrib.get("href")
                    if href:
                        for prefix in site_prefixes:
                            if href.startswith(prefix):
                                # A link to the bare site root would become
                                # an empty href; point it at "/" instead.
                                elem.attrib["href"] = href[len(prefix):] or "/"
                                break

                doctype = tree.getroottree().docinfo.doctype
                body = l_html.tostring(tree, encoding="utf-8", doctype=doctype)
                # The body was just re-encoded, so the upstream charset
                # parameter (if any) no longer describes it.
                ctype = "text/html; charset=utf-8"

            # output
            self.send_response(status)

            self.send_header("Content-type", ctype)
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()

            self.wfile.write(body)

    shttps.BaseHTTPServer.test(Handler, shttps.BaseHTTPServer.HTTPServer)


 def on_word(match):
    """Return the matched word, adding a trademark sign to six-character ones.

    Hyphens inside the word are not counted towards its length.

    :param match: regex match object whose whole match is the word.
    :returns: the word, followed by the trademark sign when its length
        without hyphens is exactly six; otherwise the word unchanged.
    """
    word = match.group()
    counted = len(word) - word.count("-")
    if counted != 6:
        return word
    return word + u"™"


 def append_tm(txt):
    res = txt
    if res:  # != None
        # &plus; - специфика хабра
        txt = html5charref.unescape(txt)
        res = re.sub(ur"\b[\w-]+\b", on_word, txt, flags=re.U)
    return res

 # Start the proxy only when this file is executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# coding: utf-8

	"""
	Setup:
	$ pip install urllib3 lxml git+https://github.com/bpabel/html5charref.git
	"""

	from __future__ import print_function
	import re

	# Python 2 has no HTML5 entity decoder (Python 3 has html.unescape), so html5charref is used instead.
	import html5charref


	def main():
	    """Run a local HTTP proxy for habrahabr.ru that marks six-character words.

	    Every GET request is forwarded to the upstream site. HTML responses are
	    parsed, their text is passed through append_tm(), absolute links to the
	    upstream domain are made site-relative (so navigation stays on the
	    proxy), and the page is re-serialised as UTF-8. Responses of any other
	    content type are relayed unchanged.

	    The server is started through BaseHTTPServer.test().
	    """
	    import SimpleHTTPServer as shttps

	    import urllib3
	    # :TODO: http://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
	    urllib3.disable_warnings()

	    import lxml.html as l_html

	    http = urllib3.PoolManager()

	    domain = "habrahabr.ru"
	    prefixes = ["http://", "https://"]
	    # Absolute URL prefixes that point at the upstream site; computed once
	    # instead of on every element.
	    site_prefixes = tuple(prefix + domain for prefix in prefixes)
	    # Elements whose text is code rather than prose: rewriting it would
	    # corrupt scripts and style sheets (e.g. the CSS word "margin").
	    code_tags = ("script", "style")

	    class Handler(shttps.SimpleHTTPRequestHandler):
	        def do_GET(self):
	            """Fetch self.path from the upstream site and relay the result."""
	            url = "http://" + domain + self.path
	            resp = http.urlopen("GET", url, preload_content=False)

	            status = resp.status
	            ctype = resp.getheader("Content-Type", "text/plain")
	            body = resp.read()

	            # Media types are case-insensitive and may carry parameters
	            # ("Text/HTML ; charset=..."), so normalise before comparing.
	            mime = ctype.split(";")[0].strip().lower()
	            if mime == "text/html":
	                tree = l_html.fromstring(body, parser=l_html.html_parser)
	                for elem in tree.iter():
	                    if elem.tag not in code_tags:
	                        elem.text = append_tm(elem.text)
	                    # The tail is the text *after* the element's closing
	                    # tag, so it is ordinary page text even for
	                    # <script>/<style> elements.
	                    elem.tail = append_tm(elem.tail)

	                    # https://habrahabr.ru/company/plarium/blog/312318/ =>
	                    # /company/plarium/blog/312318/
	                    href = elem.attrib.get("href")
	                    if href:
	                        for prefix in site_prefixes:
	                            if href.startswith(prefix):
	                                # A link to the bare site root would become
	                                # an empty href; point it at "/" instead.
	                                elem.attrib["href"] = href[len(prefix):] or "/"
	                                break

	                doctype = tree.getroottree().docinfo.doctype
	                body = l_html.tostring(tree, encoding="utf-8", doctype=doctype)
	                # The body was just re-encoded, so the upstream charset
	                # parameter (if any) no longer describes it.
	                ctype = "text/html; charset=utf-8"

	            # output
	            self.send_response(status)

	            self.send_header("Content-type", ctype)
	            self.send_header("Content-Length", str(len(body)))
	            self.end_headers()

	            self.wfile.write(body)

	    shttps.BaseHTTPServer.test(Handler, shttps.BaseHTTPServer.HTTPServer)


	def on_word(match):
	    """Return the matched word, adding a trademark sign to six-character ones.

	    Hyphens inside the word are not counted towards its length.

	    :param match: regex match object whose whole match is the word.
	    :returns: the word, followed by the trademark sign when its length
	        without hyphens is exactly six; otherwise the word unchanged.
	    """
	    res = match.group()
	    if len(res) - res.count("-") == 6:
	        res += u"™"
	    return res


	def append_tm(txt):
	res = txt
	if res: # != None
	# + - специфика хабра
	txt = html5charref.unescape(txt)
	res = re.sub(ur"\b[\w-]+\b", on_word, txt, flags=re.U)
	return res

	# Start the proxy only when this file is executed as a script.
	if __name__ == '__main__':
	    main()
No results found