Skip to content

Instantly share code, notes, and snippets.

@mwicat
Created March 2, 2012 20:41
Show Gist options
  • Select an option

  • Save mwicat/1961160 to your computer and use it in GitHub Desktop.

Select an option

Save mwicat/1961160 to your computer and use it in GitHub Desktop.
twisted proxy
import re
import sgmllib
import sys
import urlparse

from twisted.python import log
from twisted.web import http, proxy

log.startLogging(sys.stdout)

import unplug
# Not referenced anywhere else in the code shown here.
WEB_PORT = 8000
# TCP port the proxy factory is bound to via reactor.listenTCP in the
# script entry point.
PROXY_PORT = 8001
class WordCountProxyClient(proxy.ProxyClient):
    """Upstream client that holds back selected HTML responses until complete.

    A response is "scrapable" when its Content-Type is ``text/html``, the
    proxied request URI ends with ``#unplug`` and ``self.headers`` has no
    ``referer`` key.  Scrapable bodies are buffered, ``dump_links`` is
    scheduled on the URI in a reactor worker thread, and the buffered body is
    then relayed to the original request in a single write.  Every other
    response is streamed through by the base class unchanged.
    """

    # Defaults for responses that never send a Content-Type header, so the
    # predicates below need no hasattr() checks.
    is_html = False
    # Buffered body chunks; kept separate from ``self.data``, which the base
    # ProxyClient uses for the outgoing request body.
    _chunks = None
    # Set once the buffered body has been relayed.
    _scraped = False

    def handleHeader(self, key, value):
        """Forward the header and note whether the response is HTML."""
        proxy.ProxyClient.handleHeader(self, key, value)
        if key.lower() == "content-type":
            # Media types are case-insensitive and may be followed by
            # parameters ("Text/HTML ; charset=utf-8").
            media_type = value.split(';')[0].strip().lower()
            self.is_html = media_type == 'text/html'
            self._chunks = []

    def isScrapable(self):
        """Return True when this response should be buffered and scraped."""
        return (self.is_html
                and 'referer' not in self.headers
                and self.father.uri.endswith('#unplug'))

    def handleResponsePart(self, data):
        """Buffer *data* for scrapable responses, otherwise relay it now."""
        if self.isScrapable():
            self._chunks.append(data)
        else:
            proxy.ProxyClient.handleResponsePart(self, data)

    def handleResponseEnd(self):
        """Relay any buffered body, trigger link dumping, finish the request.

        The scrape-and-relay step runs at most once even if this hook is
        invoked again (the stock implementation guards against repeated
        calls as well); writing to the request after it has been finished
        is an error.
        """
        if self.isScrapable() and not self._scraped:
            self._scraped = True
            # Imported here so the class also works when this module is
            # imported rather than run as a script.
            from twisted.internet import reactor
            # NOTE(review): dump_links is not defined or imported in this
            # file as shown; it presumably lives in the imported `unplug`
            # module - confirm and qualify the name.
            reactor.callInThread(dump_links, self.father.uri)
            body = ''.join(self._chunks)
            self._chunks = []
            self.father.write(body)
        proxy.ProxyClient.handleResponseEnd(self)
class WordCountProxyClientFactory(proxy.ProxyClientFactory):
    """Client factory whose protocols behave as WordCountProxyClient."""

    def buildProtocol(self, addr):
        """Build the base factory's protocol for *addr* and re-type it.

        The instance is created by ``ProxyClientFactory.buildProtocol`` and
        its ``__class__`` is then reassigned, so it keeps whatever state the
        base constructor set while picking up the WordCountProxyClient
        method overrides.
        """
        protocol = proxy.ProxyClientFactory.buildProtocol(self, addr)
        protocol.__class__ = WordCountProxyClient
        return protocol
class WordCountProxyRequest(proxy.ProxyRequest):
    """Proxy request that fetches ``http`` URIs via WordCountProxyClient."""

    # Scheme -> client factory class used to open the upstream connection.
    protocols = {'http': WordCountProxyClientFactory}

    def process(self):
        """Validate the request URI, then forward it upstream.

        A proxy request must carry an absolute URI.  When the URI has no
        scheme the request is answered with 400 and finished, so the client
        is not left waiting on a response that never comes.  Otherwise the
        stock ``ProxyRequest.process`` does the forwarding, using the
        ``protocols`` mapping above to build the upstream client.
        """
        scheme = urlparse.urlparse(self.uri)[0]
        if not scheme:
            self.setResponseCode(400)
            self.finish()
            return
        proxy.ProxyRequest.process(self)
class WordCountProxy(proxy.Proxy):
    """Proxy channel that creates a WordCountProxyRequest per request."""

    def __init__(self):
        proxy.Proxy.__init__(self)

    def requestFactory(self, *args):
        """Return a new WordCountProxyRequest built from *args*."""
        request = WordCountProxyRequest(*args)
        return request
class WordCountProxyFactory(http.HTTPFactory):
    """Server factory that serves every connection with WordCountProxy."""

    protocol = WordCountProxy

    def __init__(self):
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a WordCountProxy for the connection from *addr*.

        Delegates to ``HTTPFactory.buildProtocol``, which instantiates
        ``self.protocol`` and wires the instance to this factory (its
        ``factory`` attribute and the factory's timeout setting), instead of
        returning a bare, unconnected protocol object.
        """
        return http.HTTPFactory.buildProtocol(self, addr)
if __name__ == "__main__":
    # Script entry point: bind the proxy factory to PROXY_PORT and run the
    # reactor until it is stopped.
    from twisted.internet import reactor

    prox = WordCountProxyFactory()
    reactor.listenTCP(PROXY_PORT, prox)

    reactor.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment