Skip to content

Instantly share code, notes, and snippets.

@mdaniel
Created May 10, 2015 20:17
Show Gist options
  • Save mdaniel/032cae7d161912d2a2ce to your computer and use it in GitHub Desktop.
Save mdaniel/032cae7d161912d2a2ce to your computer and use it in GitHub Desktop.
Extracts the proxy list from proxylist.hidemyass.com
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, unicode_literals
# Markup language used by the docstrings in this module (PEP 258 convention).
# Documentation tools compare the name case-insensitively, so it has to be
# spelled correctly to be recognised.
__docformat__ = 'reStructuredText'
import logging
import re
import sys
import html5lib
from xml.etree.ElementTree import tostring
# Configure the root logger at NOTSET so that it filters nothing by itself;
# verbosity is controlled only through this module's own logger below.
logging.basicConfig(level=logging.NOTSET)
# Module-level logger; it starts at INFO and main() lowers it to DEBUG when
# -v/--verbose is given.
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)
# Matches a class rule that starts at the beginning of a line and whose whole
# body is ``display:none`` (an optional trailing ``;`` is tolerated).  The
# single capture group is the class name without its leading dot.
DISPLAY_NONE_RE = re.compile(
    r'(?m)^\.(\S+)\s*\{\s*display\s*:\s*none\s*;?\s*\}')


def find_display_none_classes(css_text):
    """
    Finds the class names in the given css style text
    that are ``display:none``.

    Only class selectors (leading ``.``) that start a line are
    considered.  For example, feeding this::

        .thing { display:inline }
        .other { display:none }

    will return::

        ['other']

    :param css_text: the text of a ``style`` element; ``None`` or an
        empty string (e.g. from an empty ``<style/>``) yields ``[]``
    :type css_text: unicode | None
    :return: the hidden class names, in order of appearance
    :rtype: list[unicode]
    """
    if not css_text:
        # Element.text is None for an empty element; findall() would raise
        # TypeError on it, and there is nothing to find anyway.
        return []
    return DISPLAY_NONE_RE.findall(css_text)
def test_parse_css():
    """
    Self-check for `find_display_none_classes`.

    Feeds a small stylesheet that alternates hidden and visible classes,
    prints what was found and asserts that exactly the ``display:none``
    classes are reported, in order.
    """
    # The rules must start at column 0: the pattern anchors on line start.
    sample = """
.s-Aw{display:none}
.zJsC{display:inline}
.IQvs{display:none}
.yBZq{display:inline}
.CyxL{display:none}
.dt09{display:inline}
"""
    hidden = find_display_none_classes(sample)
    print(hidden)
    assert hidden == ['s-Aw', 'IQvs', 'CyxL'], hidden
def extract_proxy_info(tr_el):
    """
    Extracts one proxy description from the provided table row element.

    The port, proxy type and anonymity level are read from the text of
    the row's direct children.  The IP address is reassembled from the
    children of the element that holds the row's ``style`` block: text
    fragments whose class or inline style is ``display:none`` are
    decoys and are skipped, everything else (including the ``tail``
    text between elements) is concatenated.

    :type tr_el: xml.etree.ElementTree.Element
    :return: an ``(ip:port, proxy type, anon level)`` tuple, or ``None``
        (after logging the reason) when the row has no ``style`` block
        or no IP fragments at all
    :rtype: (unicode, unicode, unicode) | None
    :raises ValueError: if no port cell is found, or if the reassembled
        text does not start with a dotted quad
    :raises Exception: if more than one cell looks like a port
    """
    port = None
    kind = None
    level = None
    for td_num, td_el in enumerate(tr_el):
        cell_text = td_el.text
        if not cell_text:
            continue
        # A port cell is nothing but digits once whitespace is removed.
        # Cells that merely *start* with a digit (e.g. "3 minutes") are not
        # ports and must never reach int().
        digits = re.sub(r'\s+', '', cell_text)
        if re.match(r'\d+$', digits):
            if port is not None:
                raise Exception('Cannot cope with multiple ports: port=%s text=<<%s>>'
                                % (port, cell_text))
            port = int(digits)
        elif re.match(r'HTTPS?|SOCKS[45]', cell_text):
            kind = cell_text
        elif 7 == td_num:  # re.match(r'(?i)low|high')
            level = cell_text
    if port is None:
        raise ValueError('Expected port, found None')

    # we have to grab the style's parent because ElementTree
    # doesn't permit .parentNode and "dom" doesn't have .findall :-/
    #: :type: list[xml.etree.ElementTree.Element]
    style_parent_nl = tr_el.findall('.//*[style]')
    if not style_parent_nl:
        LOG.error('Expected "style" parent, found None in %s', tostring(tr_el))
        return None
    if 1 != len(style_parent_nl):
        LOG.warning('Expected only one style parent, found %d of them',
                    len(style_parent_nl))
    style_parent = style_parent_nl[0]
    #: :type: xml.etree.ElementTree.Element
    sty = style_parent.find('.//style')
    if sty is None:
        LOG.warning('Expected "style" child, found None: %s',
                    tostring(style_parent))
        return None
    # An empty <style/> has .text of None: nothing is hidden in that case.
    bad_classes = find_display_none_classes(sty.text or '')
    LOG.debug('style[%s] -= %s', sty.text, bad_classes)

    #: :type: list[unicode]
    parts = []
    for ip_el in style_parent:
        LOG.debug('IP_EL=((%s))', tostring(ip_el))
        # .text is None for an empty element; joining None would blow up,
        # so only non-empty fragments are ever collected.
        fragment = ip_el.text
        if 'style' == ip_el.tag:
            # don't continue or you'll eat the .tail text
            # which very well could contain a number or dot
            pass
        elif 'class' in ip_el.attrib:
            css = ip_el.attrib['class'].split()
            # visible only when none of its classes is display:none
            if all(name not in bad_classes for name in css):
                LOG.debug('#class(%s)=%s', css, fragment)
                if fragment:
                    parts.append(fragment)
        elif 'style' in ip_el.attrib:
            st = ip_el.attrib['style']
            # wrap the inline style in a fake ".x" rule so the same
            # pattern can decide whether it hides the element
            if 'x' not in find_display_none_classes('.x{%s}' % st):
                LOG.debug('#style(%s)=%s', st, fragment)
                if fragment:
                    parts.append(fragment)
        elif fragment:
            LOG.warning('??=<<%s>>', fragment)
        if ip_el.tail:
            parts.append(ip_el.tail)

    if not parts:
        LOG.warning('Your TD contained no IP parts')
        return None
    ip_addr = u''.join(parts)
    if not re.match(r'\d+\.\d+\.\d+\.\d+', ip_addr):
        raise ValueError('That does not appear to be an IP: %s' % ip_addr)
    ip_port = '%s:%d' % (ip_addr, port)
    LOG.debug('ip=%s', ip_port)
    return ip_port, kind, level
def run_body(body):
    """
    Enumerate and print the proxies found in `body`.

    The document is parsed with html5lib, the element whose ``id`` is
    ``listable`` is located, and every ``tr`` below it that carries a
    ``rel`` attribute is handed to `extract_proxy_info`.  Each row that
    yields a result is printed as ``ip:port<TAB>type<TAB>level``; rows
    for which nothing could be extracted are skipped.  If the document
    has no ``listable`` element an error is logged and nothing is
    printed.

    :type body: unicode
    """
    # dom: :type: xml.dom.minidom.Document
    # but using it means giving up .find and friends
    #: :type: xml.etree.ElementTree.Element
    html_el = html5lib.parse(body, namespaceHTMLElements=False)
    #: :type: xml.etree.ElementTree.Element
    tab = html_el.find('.//*[@id="listable"]')
    if tab is None:
        # find() returns None when nothing matches; without this guard the
        # next line dies with an unhelpful AttributeError.
        LOG.error('Expected an element with id="listable", found None')
        return
    for tr_el in tab.findall('.//tr[@rel]'):
        info = extract_proxy_info(tr_el)
        if not info:
            continue
        ip, proxy_kind, anon_level = info
        print('%s\t%s\t%s' % (ip, proxy_kind, anon_level))
def main(argv):
    """
    Command-line entry point: ``prog [-v|--verbose] FILE``.

    Reads UTF-8 encoded HTML from ``FILE`` (``-`` means standard input)
    and passes the decoded text to `run_body`.  ``-v``/``--verbose``
    switches the module logger to DEBUG.

    :param argv: the full argument vector, program name first
    :type argv: list[str]
    :return: ``0`` on success, ``2`` when the command line is unusable
        (unknown option or missing ``FILE``)
    :rtype: int
    """
    from getopt import GetoptError, getopt
    usage = 'usage: %s [-v|--verbose] FILE|-' % (argv[0] if argv else 'prog')
    try:
        opts, args = getopt(argv[1:], 'v', ['verbose'])
    except GetoptError as err:
        print('%s\n%s' % (err, usage), file=sys.stderr)
        return 2
    if not args:
        print(usage, file=sys.stderr)
        return 2
    if any(name in ('-v', '--verbose') for name, _ in opts):
        LOG.setLevel(logging.DEBUG)
    filename = args[0]
    # Read raw bytes and decode exactly once: on Python 3 a text-mode read
    # already returns str, which has no .decode().
    if '-' == filename:
        raw = getattr(sys.stdin, 'buffer', sys.stdin).read()
    else:
        with open(filename, 'rb') as fh:
            raw = fh.read()
    run_body(raw.decode('utf-8'))
    return 0
if __name__ == '__main__':
    # Hand whatever main() returns to the shell as the exit status
    # (None counts as success).
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment