@rmax · Created June 2, 2011 14:35
using scrapy without scrapy
"""
python parse_urls.py http://somesite/foo/ ".pdf\$"
"""
import sys
import urllib2
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import HtmlResponse
def unescape(s):
    # undo shell-escaped backslashes in the regex arguments
    s = s.replace('\\\\', '\\')
    return s
def main(argv):
    url = argv[1]
    regexes = map(unescape, argv[2:])
    # fetch the HTML content with a plain `urlopen`
    body = urllib2.urlopen(url).read()
    # wrap the body in an HtmlResponse so SgmlLinkExtractor can consume it
    response = HtmlResponse(url, body=body)
    # instantiate the extractor with the given regexes (if any)
    # and extract the links
    lx = SgmlLinkExtractor(allow=regexes)
    for link in lx.extract_links(response):
        print link.url
if __name__ == '__main__':
    if sys.argv[1:]:
        main(sys.argv)
    else:
        print "Usage: python {0} <url> [regex] [regex] ...".format(*sys.argv)
        sys.exit(1)
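
The script above targets Scrapy as it was in 2011: Python 2, urllib2, and SgmlLinkExtractor under scrapy.contrib, which later releases removed. A minimal Python 3 sketch of the same idea, assuming a recent Scrapy where the extractor lives at scrapy.linkextractors.LinkExtractor; the explicit encoding="utf-8" is an assumed fallback for pages that do not declare their own:

import sys
import urllib.request

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor


def main(argv):
    url = argv[1]
    regexes = argv[2:]
    # fetch the page with the stdlib, no crawler involved
    body = urllib.request.urlopen(url).read()
    # wrap the raw bytes in an HtmlResponse so the extractor can parse it;
    # utf-8 here is an assumed fallback encoding, not taken from the original
    response = HtmlResponse(url, body=body, encoding="utf-8")
    # LinkExtractor (lxml-based) stands in for the old SgmlLinkExtractor
    for link in LinkExtractor(allow=regexes).extract_links(response):
        print(link.url)


if __name__ == "__main__":
    if sys.argv[1:]:
        main(sys.argv)
    else:
        print("Usage: python {0} <url> [regex] [regex] ...".format(*sys.argv))
        sys.exit(1)

Invocation is unchanged, e.g. python parse_urls.py http://somesite/foo/ ".pdf$".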