using scrapy without scrapy
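A small standalone script that reuses Scrapy's SgmlLinkExtractor outside of a crawl: it fetches a page with plain urllib2, wraps the body in an HtmlResponse, and prints every extracted link whose URL matches the regexes given on the command line.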
""" | |
python parse_urls.py http://somesite/foo/ ".pdf\$" | |
""" | |
import sys | |
import urllib2 | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.http import HtmlResponse | |
def unescape(s): | |
s = s.replace('\\\\', '\\') | |
return s | |
def main(argv): | |
url = argv[1] | |
regexes = map(unescape, argv[2:]) | |
# fetch html content using simple `urlopen` | |
body = urllib2.urlopen(url).read() | |
# wrap body into HtmlResponse in order to use SgmlLinkExtractor | |
response = HtmlResponse(url, body=body) | |
# instance extractor with given (if any) regexes | |
# and extract links | |
lx = SgmlLinkExtractor(allow=regexes) | |
for link in lx.extract_links(response): | |
print link.url | |
if __name__ == '__main__': | |
if sys.argv[1:]: | |
main(sys.argv) | |
else: | |
print "Usage: python {0} <url> [regex] [regex] ...".format(*sys.argv) | |
sys.exit(1) |
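The script above targets Python 2 and the Scrapy API as of 2011. In later Scrapy releases the SGML-based extractor was removed in favour of an lxml-based one exposed as scrapy.linkextractors.LinkExtractor, and urllib2 became urllib.request. A rough modern equivalent might look like the minimal sketch below; it drops the backslash-unescaping helper for brevity, and the filename and usage string are simply carried over from the original.

"""Python 3 sketch of the same idea, assuming a modern Scrapy release.

Usage:
    python parse_urls.py http://somesite/foo/ ".pdf$"
"""
import sys
from urllib.request import urlopen

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor


def main(argv):
    url = argv[1]

    # fetch the raw bytes with the standard library, no Scrapy downloader
    body = urlopen(url).read()

    # wrap the body in an HtmlResponse so the extractor can parse it
    response = HtmlResponse(url, body=body)

    # LinkExtractor replaced SgmlLinkExtractor; `allow` still takes regexes
    lx = LinkExtractor(allow=argv[2:])
    for link in lx.extract_links(response):
        print(link.url)


if __name__ == '__main__':
    if sys.argv[1:]:
        main(sys.argv)
    else:
        print("Usage: python {0} <url> [regex] [regex] ...".format(*sys.argv))
        sys.exit(1)

As before, the trick is simply wrapping the raw response bytes in an HtmlResponse so the extractor has something it knows how to parse; no crawler, scheduler, or settings object is involved.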