Skip to content

Instantly share code, notes, and snippets.

@asmedrano
Created October 24, 2012 17:52
Show Gist options
  • Select an option

  • Save asmedrano/3947679 to your computer and use it in GitHub Desktop.

Select an option

Save asmedrano/3947679 to your computer and use it in GitHub Desktop.
prettifythis
class ScruffySpider(BaseSpider):
    """Spider that collects on-site links from the Constant Contact home page.

    ``parse`` turns every acceptable ``<a href>`` found in a response into a
    ``ScruffyItem`` whose ``"url"`` field holds the absolute link.
    """

    name = "scruffy"
    allowed_domains = ['constantcontact.com']
    start_urls = [
        'http://www.constantcontact.com/index.jsp'
    ]
    # Raw strings so the backslash escapes reach the regex engine untouched.
    domain_pattern = re.compile(r'constantcontact\.com')
    mailto_pattern = re.compile(r'mailto\:')
    # URLs that have already been emitted as items. This is a class-level
    # list, so it is shared by every instance of the spider.
    visited_pages = []

    def parse(self, response):
        """Extract not-yet-seen constantcontact.com links from ``response``.

        A link is kept when, after being made absolute, it is not the bare
        ``#`` anchor, matches ``domain_pattern``, does not match
        ``mailto_pattern`` and is not already in ``visited_pages``. Each kept
        URL is recorded in ``visited_pages`` and printed.

        Returns:
            A list of ``ScruffyItem`` objects, one per accepted URL.
        """
        hcs = HtmlCSSSelector(response)
        items = []
        for link in hcs.select('a'):
            url = link.get('href')
            # Skip anchors without an href, and empty hrefs, which would
            # otherwise break the prefix checks below.
            if not url:
                continue

            if url.startswith('//'):
                # Protocol-relative link: it already names a host, so only a
                # scheme is missing.
                url = 'http:' + url
            elif url.startswith('/'):
                # Site-relative link: anchor it to the site root.
                url = 'http://www.constantcontact.com' + url

            if url == '#':
                continue
            if self.domain_pattern.search(url) is None:
                continue
            if self.mailto_pattern.search(url) is not None:
                continue
            # NOTE: the original test `url in visited == False` was a chained
            # comparison that is always false, so no link was ever accepted.
            if url in self.visited_pages:
                continue

            # Remember the URL so the same link is not emitted twice.
            self.visited_pages.append(url)

            item = ScruffyItem()
            item["url"] = url
            items.append(item)
            print(url)
        return items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment