@rmax
Created February 3, 2012 20:05
# Module-level imports needed by this snippet (Python 2 / Scrapy of this era).
# UserLoader is a project-specific ItemLoader, _get_href_text a spider helper,
# and requestGenerator the inline-request decorator; all are defined elsewhere
# in the project (see the sketch of the decorator after the snippet).
from urlparse import urljoin

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


@requestGenerator
def parse_profile(self, response):
    # Load the basic profile fields from the profile page itself.
    base_url = response.url
    ul = UserLoader(response=response)
    ul.add_xpath('name', '//h1[1]/text()')
    ul.add_xpath('website', '//*[@rel="me" and @class="url"]/text()')
    ul.add_xpath('location', '//*[@class="label adr"]/text()')
    ul.add_value('url', base_url)
    item = ul.load_item()

    # Scrape answers: the decorator sends the fetched response back into
    # this generator, so the yield behaves like an inline request.
    response = yield Request(base_url + '?tab=answers')
    hxs = HtmlXPathSelector(response)
    answers = item['answers'] = []
    for link in hxs.select('//*[@class="answer-link"]/a'):
        href, title = self._get_href_text(link)
        answers.append({
            'title': title,
            'url': urljoin(response.url, href),
        })

    # Scrape questions.
    response = yield Request(base_url + '?tab=questions')
    hxs = HtmlXPathSelector(response)
    questions = item['questions'] = []
    for link in hxs.select('//*[@class="user-questions"]//h3/a'):
        href, title = self._get_href_text(link)
        questions.append({
            'title': title,
            'url': urljoin(response.url, href),
        })

    # Scrape tags.
    response = yield Request(base_url + '?tab=tags')
    hxs = HtmlXPathSelector(response)
    item['tags'] = hxs.select(
        '//*[@class="user-tags"]//*[@class="post-tag"]/text()').extract()

    # Finally, yield the completed item.
    yield item
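
The requestGenerator decorator used above is not part of Scrapy itself. Below is a minimal sketch of how such a decorator could be built on top of Scrapy's callback model; the helper names (_resume, wrapper) are assumptions for illustration, not the author's actual implementation (a similar idea later appeared in the scrapy-inline-requests package).

from functools import wraps

from scrapy.http import Request


def requestGenerator(callback):
    """Drive a generator-style callback: every yielded Request is rescheduled
    with a callback that sends its Response back into the generator, while any
    other yielded object (e.g. an item) is passed through to the engine."""
    @wraps(callback)
    def wrapper(self, response):
        # The first response is already bound as the generator's argument;
        # send(None) inside _resume performs the initial advance.
        return _resume(callback(self, response), None)
    return wrapper


def _resume(generator, response):
    try:
        obj = generator.send(response)
    except StopIteration:
        return
    if isinstance(obj, Request):
        # Re-yield the request so that its response resumes the generator.
        yield obj.replace(callback=lambda r: _resume(generator, r))
    else:
        # Forward items to Scrapy and keep driving the generator.
        yield obj
        for following in _resume(generator, None):
            yield following

Under this sketch, each yielded Request still goes through Scrapy's scheduler; the only trick is that its callback resumes the same generator, which is what lets "response = yield Request(...)" read like a synchronous fetch inside parse_profile.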