Skip to content

Instantly share code, notes, and snippets.

@domluna
Created September 14, 2015 01:46
Show Gist options
  • Select an option

  • Save domluna/dbb321accd7ae921d48a to your computer and use it in GitHub Desktop.

Select an option

Save domluna/dbb321accd7ae921d48a to your computer and use it in GitHub Desktop.
HN comment scraper
from __future__ import print_function
from scrapy.selector import Selector
import requests
def scrape_comments(url, timeout=10):
    """Fetch a page and return the text of its comment paragraphs.

    The page at ``url`` is downloaded with ``requests`` and parsed with a
    scrapy ``Selector``. Every ``<span>`` directly inside a
    ``<span class="comment">`` is treated as one comment.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            caller indefinitely.

    Returns:
        A flat list of strings, in document order of the comments. For
        each comment, the text nodes that sit directly in the comment
        span come first, followed by the text nodes of its ``<p>``
        children. Text nested inside other inline tags is not collected.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (including a timeout when ``timeout``
            elapses).
    """
    html = requests.get(url, timeout=timeout).text
    comment_spans = Selector(text=html).xpath("//span[@class='comment']/span")

    comments = []
    for span in comment_spans:
        # The opening paragraph of a comment is bare text directly inside
        # the span rather than being wrapped in a <p> tag; there may be
        # more than one such text node.
        comments.extend(span.xpath('./text()').extract())
        # Every following paragraph lives in its own <p> tag.
        comments.extend(span.xpath('./p/text()').extract())
    return comments
if __name__ == '__main__':
    # Demo run against a fixed Hacker News thread: print the scraped
    # paragraphs followed by how many were collected.
    thread_url = 'https://news.ycombinator.com/item?id=10212770'
    comments = scrape_comments(thread_url)
    print(comments, len(comments))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment