Last active
May 30, 2016 14:32
-
-
Save raingloom/19f62b3aca431d01d8fbac1143314de2 to your computer and use it in GitHub Desktop.
Python 2 href extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html5lib | |
def filterHref(stream):
    """Return the nodes of the parsed document that carry an ``href`` attribute.

    ``stream`` is handed unchanged to ``html5lib``'s ``HTMLParser.parse``;
    the script entry point in this file passes an open text-mode file.

    The parsed tree is walked with ``.iter()`` and a node is kept when
    ``node.get("href")`` is not ``None`` (an empty ``href=""`` is therefore
    kept as well).  The result is whatever the built-in ``filter`` returns:
    a list on Python 2, a lazy iterator on Python 3.
    """
    def _has_href(node):
        # Compare against None explicitly so that empty-string hrefs still count.
        return node.get("href") is not None

    parser = html5lib.html5parser.HTMLParser()
    document = parser.parse(stream)
    return filter(_has_href, document.iter())
if __name__ == '__main__':
    import sys
    import io

    # Script mode: the first command-line argument is the path of the document
    # to scan; every href found is printed on its own line.
    # The context manager guarantees the input file is closed again, even if
    # parsing or printing raises (the handle used to be left open).
    with io.open(sys.argv[1], mode="r") as source:
        for e in filterHref(source):
            # Single parenthesised argument: this prints the same thing under
            # both the Python 2 print statement and the Python 3 print function.
            print(e.get("href"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Downloads some word lists with wget.
#
# The User-Agent header is read from a file named `useragent` in the
# current directory, for example:
#   $ cat useragent
#   Mozilla/5.0 (X11; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0 Iceweasel/43.0.4

# Create the download directory if it does not exist yet.
mkdir -p files

# The standard output of getlinks.py is piped into wget, which reads it as its
# URL list (--input-file=-); --base names the site used to resolve relative links.
python getlinks.py test.xml \
    | wget \
        --verbose \
        --base='http://www.aciddr0p.net' \
        --input-file=- \
        --directory-prefix=files \
        --no-directories \
        --show-progress \
        --user-agent="$(cat useragent)"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment