raingloom · May 30, 2016 14:32
diff --git a/getlinks.py b/getlinks.py
 import html5lib

 def filterHref( stream ):
 	"""Return the parsed nodes of *stream* that carry an ``href`` attribute.

 	*stream* is handed to html5lib's ``HTMLParser().parse``; every node
 	produced by the resulting tree's ``iter()`` is kept when its
 	``get("href")`` is not ``None``. The return value is whatever the builtin
 	``filter`` yields: a list on Python 2, a lazy iterator on Python 3.
 	"""
 	def hasHref( node ):
 		return node.get("href") is not None

 	document = html5lib.html5parser.HTMLParser().parse( stream )
 	return filter( hasHref, document.iter() )
 if __name__ == '__main__':
 	# Usage: python getlinks.py FILE
 	# Prints the href value of every element in FILE that has one, one per line.
 	import sys
 	import io
 	# Open the input in a with-block so the handle is closed as soon as all
 	# links have been printed, instead of leaking until interpreter exit.
 	with io.open( sys.argv[1], mode="r" ) as source:
 		for e in filterHref( source ):
 			# A parenthesised single-argument print works on both Python 3
 			# (function call) and Python 2 (parentheses around one value do
 			# not create a tuple).
 			print( e.get("href") )
diff --git a/test.sh b/test.sh
 # Downloads some word lists with wget.
 # The User-Agent string is read from the file ./useragent, for example:
 #   cat useragent
 #   Mozilla/5.0 (X11; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0 Iceweasel/43.0.4
 mkdir -p files
 # getlinks.py prints the hrefs found in test.xml; wget reads that list from
 # stdin (--input-file=-) and stores the downloads, without recreating the
 # remote directory hierarchy, under files/.
 python getlinks.py test.xml |
 	wget --verbose \
 		--base 'http://www.aciddr0p.net' \
 		--input-file=- \
 		--directory-prefix=files \
 		--no-directories \
 		--show-progress \
 		--user-agent="$(cat useragent)"
	import html5lib

	def filterHref( stream ):
		"""Return the elements parsed from *stream* that have an ``href`` attribute.

		The markup in *stream* is parsed with html5lib's ``HTMLParser`` and each
		node from the parsed tree's ``iter()`` is tested with ``get("href")``.
		The result is the builtin ``filter`` output: a list on Python 2, a lazy
		iterator on Python 3.
		"""
		# The body must be nested under the def; at the def's own indent level
		# Python rejects the file with an IndentationError.
		return filter( lambda x: x.get("href") is not None, html5lib.html5parser.HTMLParser().parse( stream ).iter())
	if __name__ == '__main__':
		# Usage: python getlinks.py FILE
		# Prints the href value of every element in FILE that has one, one per line.
		import sys
		import io
		# The with-block closes the input file once every link has been printed.
		with io.open( sys.argv[1], mode="r" ) as source:
			for e in filterHref( source ):
				# Parenthesised single-argument print is valid on Python 2 and 3:
				# parentheses around one value do not build a tuple on Python 2.
				print( e.get("href") )
	# Downloads some word lists with wget.
	# The User-Agent string is read from the file ./useragent, for example:
	#   cat useragent
	#   Mozilla/5.0 (X11; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0 Iceweasel/43.0.4
	mkdir -p files
	# The pipe must be a real, unescaped `|`: a backslash-escaped one is passed
	# to python as a literal argument and wget never runs. wget reads the link
	# list printed by getlinks.py from stdin (--input-file=-).
	python getlinks.py test.xml | wget --verbose --base 'http://www.aciddr0p.net' --input-file=- --directory-prefix=files -nd --show-progress --user-agent="$(cat useragent)"