using scrapy without scrapy
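A small standalone script that reuses Scrapy's SgmlLinkExtractor outside of a crawl: it fetches a page with plain urllib2, wraps the body in an HtmlResponse, and prints every extracted link whose URL matches the regexes given on the command line.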
""" | |
python parse_urls.py http://somesite/foo/ ".pdf\$" | |
""" | |
import sys | |
import urllib2 | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.http import HtmlResponse | |
def unescape(s): | |
s = s.replace('\\\\', '\\') | |
return s | |
def main(argv): | |
url = argv[1] | |
regexes = map(unescape, argv[2:]) | |
# fetch html content using simple `urlopen` | |
body = urllib2.urlopen(url).read() | |
# wrap body into HtmlResponse in order to use SgmlLinkExtractor | |
response = HtmlResponse(url, body=body) | |
# instance extractor with given (if any) regexes | |
# and extract links | |
lx = SgmlLinkExtractor(allow=regexes) | |
for link in lx.extract_links(response): | |
print link.url | |
if __name__ == '__main__': | |
if sys.argv[1:]: | |
main(sys.argv) | |
else: | |
print "Usage: python {0} <url> [regex] [regex] ...".format(*sys.argv) | |
sys.exit(1) |
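The script above targets Python 2 and the Scrapy API as of 2011. In later Scrapy releases the SGML-based extractor was removed in favour of an lxml-based one exposed as scrapy.linkextractors.LinkExtractor, and urllib2 became urllib.request. A rough modern equivalent might look like the minimal sketch below; it drops the backslash-unescaping helper for brevity, and the filename and usage string are simply carried over from the original.

"""Python 3 sketch of the same idea, assuming a modern Scrapy release.

Usage:
    python parse_urls.py http://somesite/foo/ ".pdf$"
"""
import sys
from urllib.request import urlopen

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor


def main(argv):
    url = argv[1]

    # fetch the raw bytes with the standard library, no Scrapy downloader
    body = urlopen(url).read()

    # wrap the body in an HtmlResponse so the extractor can parse it
    response = HtmlResponse(url, body=body)

    # LinkExtractor replaced SgmlLinkExtractor; `allow` still takes regexes
    lx = LinkExtractor(allow=argv[2:])
    for link in lx.extract_links(response):
        print(link.url)


if __name__ == '__main__':
    if sys.argv[1:]:
        main(sys.argv)
    else:
        print("Usage: python {0} <url> [regex] [regex] ...".format(*sys.argv))
        sys.exit(1)

As before, the trick is simply wrapping the raw response bytes in an HtmlResponse so the extractor has something it knows how to parse; no crawler, scheduler, or settings object is involved.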