Created
May 10, 2015 20:17
-
-
Save mdaniel/032cae7d161912d2a2ce to your computer and use it in GitHub Desktop.
Extracts the proxy list from proxylist.hidemyass.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from __future__ import absolute_import, print_function, unicode_literals | |
# Docstring markup used throughout this module (read by documentation tools).
__docformat__ = 'reStructuredText'
import logging | |
import re | |
import sys | |
import html5lib | |
from xml.etree.ElementTree import tostring | |
# Configure the root logger with the NOTSET level so that filtering is
# decided by the module logger below rather than by the root.
logging.basicConfig(level=logging.NOTSET)
LOG = logging.getLogger(__name__)
# Default verbosity; main() switches this to DEBUG when -v/--verbose is given.
LOG.setLevel(logging.INFO)
# One rule per line of the form ``.name{display:none}``.  The class name may
# not contain whitespace or ``{`` (so it cannot swallow a neighbouring rule),
# the rule may be indented, and a trailing ``;`` after ``none`` is accepted.
DISPLAY_NONE_RE = re.compile(
    r'(?m)^[ \t]*\.([^\s{]+)\s*\{\s*display\s*:\s*none\s*;?\s*\}')


def find_display_none_classes(css_text):
    """
    Finds the class names in the given css style text
    that are ``display:none``.

    Only rules that start a line (optionally indented) and consist of a
    single ``display`` declaration are recognised.

    For example, feeding this::

        .thing { display:inline }
        .other { display:none }

    will return::

        ['other']

    :param css_text: the text of a ``<style>`` element; ``None`` or an
        empty string yields an empty list
    :type css_text: unicode | None
    :rtype: list[unicode]
    """
    if not css_text:
        # An empty <style/> element has ``.text`` of None
        return []
    return DISPLAY_NONE_RE.findall(css_text)
def test_parse_css():
    """
    Manual smoke check: print the ``display:none`` class names that
    :func:`find_display_none_classes` extracts from a small sample sheet.
    """
    sample_css = (
        '\n'
        '.s-Aw{display:none}\n'
        '.zJsC{display:inline}\n'
        '.IQvs{display:none}\n'
        '.yBZq{display:inline}\n'
        '.CyxL{display:none}\n'
        '.dt09{display:inline}\n'
    )
    hidden_names = find_display_none_classes(sample_css)
    print(hidden_names)
# A cell whose entire text is a (whitespace padded) integer is the port.
_PORT_RE = re.compile(r'\s*(\d+)\s*$')
# Proxy protocol names as they appear in the table.
_PROXY_KIND_RE = re.compile(r'HTTPS?|SOCKS[45]')
# Dotted quad; range checking of the octets is intentionally not done.
_IPV4_RE = re.compile(r'\d+\.\d+\.\d+\.\d+$')
# Inline ``style="..."`` value that hides its element.
_INLINE_HIDDEN_RE = re.compile(r'(?i)display\s*:\s*none')


def _parse_row_cells(tr_el):
    """
    Returns the (port, kind, level) found in the direct children of ``tr_el``;
    any of the three may be ``None`` when no cell provided it.

    :raises ValueError: if more than one cell looks like a port
    """
    port = None
    kind = None
    level = None
    for td_num, td_el in enumerate(tr_el):
        text = td_el.text
        if not text:
            continue
        port_match = _PORT_RE.match(text)
        if port_match:
            if port is not None:
                raise ValueError(
                    'Cannot cope with multiple ports: port=%s text=<<%s>>'
                    % (port, text))
            port = int(port_match.group(1))
        elif _PROXY_KIND_RE.match(text):
            kind = text
        elif 7 == td_num:  # re.match(r'(?i)low|high')
            level = text
    return port, kind, level


def _visible_ip_parts(style_parent, bad_classes):
    """
    Returns the text fragments of ``style_parent``'s children that are not
    hidden, either by one of ``bad_classes`` or by an inline style.
    """
    parts = []
    # tostring() is not free; only serialise when it will actually be logged
    debug_on = LOG.isEnabledFor(logging.DEBUG)
    for ip_el in style_parent:
        if debug_on:
            LOG.debug('IP_EL=((%s))', tostring(ip_el))
        attrs = ip_el.attrib
        if 'style' == ip_el.tag:
            # don't continue or you'll eat the .tail text
            # which very well could contain a number or dot
            pass
        elif 'class' in attrs:
            css = attrs['class'].split()
            if all(name not in bad_classes for name in css):
                LOG.debug('#class(%s)=%s', css, ip_el.text)
                # an empty element has .text of None, which cannot be joined
                if ip_el.text:
                    parts.append(ip_el.text)
        elif 'style' in attrs:
            st = attrs['style']
            if not _INLINE_HIDDEN_RE.search(st):
                LOG.debug('#style(%s)=%s', st, ip_el.text)
                if ip_el.text:
                    parts.append(ip_el.text)
        elif ip_el.text:
            LOG.warning('??=<<%s>>', ip_el.text)
        # text following the element is always part of the address
        if ip_el.tail:
            parts.append(ip_el.tail)
    return parts


def extract_proxy_info(tr_el):
    """
    Returns an (ip:port, proxy type, anon level) tuple
    from the provided table row element.

    The IP address is reassembled from the fragments of the cell that
    contains a ``<style>`` element, skipping the fragments that the style
    sheet (or an inline style) marks as ``display:none``.

    :type tr_el: xml.etree.ElementTree.Element
    :returns: the tuple, or ``None`` when the row has no usable IP cell
        (the reason is logged)
    :rtype: (unicode, unicode, unicode) | None
    :raises ValueError: if the row has no port, more than one port,
        or the reassembled text does not look like an IP address
    """
    port, kind, level = _parse_row_cells(tr_el)
    if port is None:
        raise ValueError('Expected port, found None')

    # we have to grab the style's parent because ElementTree
    # doesn't permit .parentNode and "dom" doesn't have .findall :-/
    #: :type: list[xml.etree.ElementTree.Element]
    style_parent_nl = tr_el.findall('.//*[style]')
    if not style_parent_nl:
        LOG.error('Expected "style" parent, found None in %s', tostring(tr_el))
        return None
    if 1 != len(style_parent_nl):
        LOG.warning('Expected only one style parent, found %d of them',
                    len(style_parent_nl))
    style_parent = style_parent_nl[0]

    #: :type: xml.etree.ElementTree.Element
    sty = style_parent.find('.//style')
    if sty is None:
        LOG.warning('Expected "style" child, found None: %s',
                    tostring(style_parent))
        return None

    # an empty <style/> has .text of None
    bad_classes = find_display_none_classes(sty.text or '')
    LOG.debug('style[%s] -= %s', sty.text, bad_classes)

    parts = _visible_ip_parts(style_parent, frozenset(bad_classes))
    if not parts:
        LOG.warning('Your TD contained no IP parts')
        return None

    # tails may carry the markup's surrounding whitespace
    ip_addr = u''.join(parts).strip()
    if not _IPV4_RE.match(ip_addr):
        raise ValueError('That does not appear to be an IP: %s' % ip_addr)
    ip_port = '%s:%d' % (ip_addr, port)
    LOG.debug('ip=%s', ip_port)
    return ip_port, kind, level
def run_body(body):
    """
    Enumerate and print the proxies found in `body`.

    One tab separated ``ip:port``, proxy kind, anonymity level line is
    printed for every ``<tr rel=...>`` row under the element whose id is
    ``listable``. Rows for which :func:`extract_proxy_info` returns
    nothing are skipped.

    :param body: the HTML text of the proxy list page
    :type body: unicode
    """
    # dom: :type: xml.dom.minidom.Document
    # but using it means giving up .find and friends
    #: :type: xml.etree.ElementTree.Element
    html_el = html5lib.parse(body, namespaceHTMLElements=False)
    #: :type: xml.etree.ElementTree.Element
    tab = html_el.find('.//*[@id="listable"]')
    if tab is None:
        # .find() returns None when nothing matches; without this guard
        # the .findall below would die with an AttributeError
        LOG.error('Expected an element with id="listable", found None')
        return
    for tr_el in tab.findall('.//tr[@rel]'):
        info = extract_proxy_info(tr_el)
        if not info:
            continue
        ip, proxy_kind, anon_level = info
        print('%s\t%s\t%s' % (ip, proxy_kind, anon_level))
def main(argv):
    """
    Command line entry point: ``prog [-v|--verbose] FILE``.

    Reads the HTML from ``FILE`` (or from standard input when ``FILE``
    is ``-``), decodes it as UTF-8 and hands it to :func:`run_body`.

    :param argv: the full argument vector, program name included
    :type argv: list[str]
    :raises SystemExit: if no input file was named
    """
    from getopt import getopt
    opts, args = getopt(argv[1:], 'v', ['verbose'])
    is_verbose = any(flag in ('-v', '--verbose') for flag, _ in opts)
    if is_verbose:
        LOG.setLevel(logging.DEBUG)
    if not args:
        raise SystemExit('usage: %s [-v|--verbose] FILE|-'
                         % (argv[0] if argv else 'prog'))
    filename = args[0]
    if '-' == filename:
        # Python 3 exposes the raw bytes as sys.stdin.buffer;
        # on Python 2 sys.stdin itself already yields bytes
        raw = getattr(sys.stdin, 'buffer', sys.stdin).read()
    else:
        # binary mode so the explicit UTF-8 decode works on 2 and 3 alike
        with open(filename, 'rb') as fh:
            raw = fh.read()
    # a replaced sys.stdin (e.g. StringIO) may already hand back text
    body = raw.decode('utf-8') if isinstance(raw, bytes) else raw
    run_body(body)
# Script entry point: pass the complete argv (program name included) to
# main(), which skips the first element itself.
if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment