@tazjel
Created July 5, 2013 06:10
# Python 2 script using the old BeautifulSoup 3 package and the requests library.
from datetime import datetime

import BeautifulSoup as soup
import requests

host = 'http://news.ycombinator.com'


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i + n]


def get_news(depth):
    """Collect (title, href) pairs from the first `depth` front pages."""
    def _get_news(page, acc, depth):
        if not page.startswith('/'):
            page = '/' + page
        r = requests.get(host + page)
        doc = soup.BeautifulSoup(r.content)
        # Cells with class 'title' alternate between the rank cell and the
        # title cell; the last one holds the "More" link to the next page.
        titles = doc.table.findAll(True, {'class': 'title'})
        articles = titles[:-1]
        for no, title in chunks(articles, 2):
            acc.append((title.a.string, title.a['href']))
        if depth > 1:
            more = titles[-1]
            return _get_news(more.a['href'], acc, depth - 1)
        else:
            return acc
    return _get_news('/', [], depth)


def main():
    print 'Hacker News Scraper'
    print 'generated at %s' % datetime.now().strftime('%a, %d %b %Y %H:%M:%S')
    print
    for no, (title, href) in enumerate(get_news(5)):
        print '%d. %s (%s)' % (no + 1, title, href)


if __name__ == '__main__':
    main()
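
For readers on Python 3, a rough equivalent using BeautifulSoup 4 is sketched below. It keeps the gist's structure (fetch a page, collect title links, follow the "More" link for pagination) but swaps in bs4 and the print() function. The CSS selectors 'tr.athing span.titleline > a' and 'a.morelink' are assumptions about Hacker News's present-day markup, which differs from the 2013 HTML the gist targets, and may need adjusting.

from datetime import datetime

import requests
from bs4 import BeautifulSoup  # pip install beautifulsoup4

HOST = 'https://news.ycombinator.com'


def get_news(depth):
    """Collect (title, href) pairs from the first `depth` front pages."""
    acc = []
    page = '/'
    for _ in range(depth):
        r = requests.get(HOST + page)
        doc = BeautifulSoup(r.text, 'html.parser')
        # Assumed selector: each story title is an <a> inside span.titleline.
        for a in doc.select('tr.athing span.titleline > a'):
            acc.append((a.get_text(), a['href']))
        # Assumed selector: the "More" link carries class "morelink".
        more = doc.select_one('a.morelink')
        if more is None:
            break
        page = '/' + more['href'].lstrip('/')
    return acc


def main():
    print('Hacker News Scraper')
    print('generated at %s' % datetime.now().strftime('%a, %d %b %Y %H:%M:%S'))
    print()
    for no, (title, href) in enumerate(get_news(5), start=1):
        print('%d. %s (%s)' % (no, title, href))


if __name__ == '__main__':
    main()

A plain loop replaces the recursion of the original, since the accumulator and page name can simply be carried between iterations; stopping early when no "More" link is found also avoids an IndexError on the last page.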