Created
March 2, 2012 20:41
-
-
Save mwicat/1961160 to your computer and use it in GitHub Desktop.
twisted proxy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sgmllib
import re
from twisted.web import proxy, http
import sys
from twisted.python import log

# Send Twisted's log output to stdout; this runs before ``unplug`` is imported.
log.startLogging(sys.stdout)

import unplug

# TCP port constants.  PROXY_PORT is the port the proxy listens on when this
# module is run as a script; WEB_PORT does not appear to be used elsewhere in
# this module.
WEB_PORT = 8000
PROXY_PORT = 8001
class WordCountProxyClient(proxy.ProxyClient):
    """Proxy client that buffers selected HTML responses before relaying them.

    A response is treated as "scrapable" when the requested URI ends with
    ``#unplug``, the forwarded request headers contain no ``referer`` entry
    and the upstream ``Content-Type`` is ``text/html``.  The body of such a
    response is accumulated in ``self.data``; when the response ends,
    ``dump_links`` is scheduled in a reactor worker thread with the request
    URI and the buffered body is written to the originating request in one
    piece.  Every other response is relayed chunk by chunk by
    ``proxy.ProxyClient``.
    """

    # Class-level defaults so both attributes exist even when the upstream
    # response carries no Content-Type header.
    is_html = False
    data = ''
    # Becomes True once the buffered body has been written back, so a repeated
    # handleResponseEnd call does not write to an already finished request.
    _flushed = False

    def handleHeader(self, key, value):
        """Relay one response header and note whether the body is HTML.

        :param key: header name as received from the upstream server.
        :param value: header value as received from the upstream server.
        """
        proxy.ProxyClient.handleHeader(self, key, value)
        if key.lower() == "content-type":
            # Drop parameters such as "; charset=utf-8" and compare the media
            # type without regard to case or surrounding whitespace.
            media_type = value.split(';')[0].strip().lower()
            self.is_html = media_type == 'text/html'
            self.data = ''

    def isScrapable(self):
        """Return True when this response should be buffered for scraping."""
        return (
            self.father.uri.endswith('#unplug')
            and 'referer' not in self.headers
            and self.is_html
        )

    def handleResponsePart(self, data):
        """Buffer a body chunk of a scrapable response, relay any other chunk.

        :param data: a chunk of the upstream response body.
        """
        if self.isScrapable():
            self.data += data
        else:
            proxy.ProxyClient.handleResponsePart(self, data)

    def handleResponseEnd(self):
        """Flush a buffered body (at most once), then finish the response.

        Twisted's ``HTTPClient`` can invoke this method again from
        ``connectionLost``, so the buffered branch is guarded by
        ``_flushed``.
        """
        if self.isScrapable() and not self._flushed:
            # Imported here because the module only binds ``reactor`` at top
            # level when it is executed as a script.
            from twisted.internet import reactor

            self._flushed = True
            # NOTE(review): ``dump_links`` is not defined or imported anywhere
            # in this module as shown -- presumably it lives in ``unplug``;
            # confirm and qualify the name accordingly.
            reactor.callInThread(dump_links, self.father.uri)
            self.father.write(self.data)
            self.data = ''
        proxy.ProxyClient.handleResponseEnd(self)
class WordCountProxyClientFactory(proxy.ProxyClientFactory):
    """Client factory whose protocol instances behave as WordCountProxyClient."""

    def buildProtocol(self, addr):
        """Build the stock proxy client for ``addr`` and re-class it.

        The protocol is created by ``proxy.ProxyClientFactory`` and its
        ``__class__`` is then switched to ``WordCountProxyClient``.
        """
        protocol = proxy.ProxyClientFactory.buildProtocol(self, addr)
        protocol.__class__ = WordCountProxyClient
        return protocol
class WordCountProxyRequest(proxy.ProxyRequest):
    """Proxy request that connects upstream via WordCountProxyClientFactory."""

    # Maps a URI scheme to the client factory used for the upstream connection.
    protocols = {'http': WordCountProxyClientFactory}

    def process(self):
        """Forward the request upstream, or reject it when it cannot be proxied.

        The request URI must carry a scheme that is present in both
        ``self.protocols`` and ``self.ports``.  Such requests are handed to
        ``proxy.ProxyRequest.process`` for forwarding.  Any other request is
        answered with ``400 Bad Request`` and finished, so the client is not
        left waiting for a response.
        """
        # Function-scope import: the module does not import ``urlparse`` at
        # top level.
        try:
            from urlparse import urlparse  # Python 2
        except ImportError:
            from urllib.parse import urlparse

        scheme = urlparse(self.uri)[0]
        if scheme not in self.protocols or scheme not in self.ports:
            self.setResponseCode(http.BAD_REQUEST)
            self.finish()
            return
        proxy.ProxyRequest.process(self)
class WordCountProxy(proxy.Proxy):
    """Proxy channel that creates a WordCountProxyRequest per incoming request."""

    def __init__(self):
        # No extra state; initialisation is left entirely to proxy.Proxy.
        proxy.Proxy.__init__(self)

    def requestFactory(self, *args):
        """Return a new ``WordCountProxyRequest`` built from ``args``."""
        request = WordCountProxyRequest(*args)
        return request
class WordCountProxyFactory(http.HTTPFactory):
    """Server factory that serves each connection with a WordCountProxy."""

    protocol = WordCountProxy

    def __init__(self):
        http.HTTPFactory.__init__(self)

    def buildProtocol(self, addr):
        """Return a ``WordCountProxy`` channel for the connection from ``addr``.

        Construction is delegated to ``http.HTTPFactory.buildProtocol``, which
        instantiates ``self.protocol``, rather than returning a bare
        ``WordCountProxy()`` that has no link back to this factory.

        :param addr: address of the connecting peer, as supplied by Twisted.
        """
        channel = http.HTTPFactory.buildProtocol(self, addr)
        return channel
if __name__ == "__main__":
    # Script entry point: listen on PROXY_PORT and run the reactor.
    from twisted.internet import reactor

    prox = WordCountProxyFactory()
    reactor.listenTCP(PROXY_PORT, prox)
    # Start the event loop.
    reactor.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment