Skip to content

Instantly share code, notes, and snippets.

@tylerwillingham
Created June 14, 2014 18:40
Show Gist options
  • Save tylerwillingham/5f01bbbeb998f2c2be89 to your computer and use it in GitHub Desktop.
Save tylerwillingham/5f01bbbeb998f2c2be89 to your computer and use it in GitHub Desktop.
Retrieves story titles and URLs from bimmerpost.com.
from bs4 import BeautifulSoup
import requests
# How many stories the user asked for; read once from the console.
story_qty = int(raw_input('How many stories do you want to get? '))

# Shared counters: the next page number to fetch and the number of
# stories recorded so far.
page, story_count = 1, 0

# One {'title': ..., 'url': ...} dict per story, in the order found.
stories = []
# Get the data to filter for a given page
def getPageData(timeout=10):
    """Fetch the next listing page and return it as a parsed document.

    The module-level ``page`` counter selects the URL and is advanced by
    one before the request is made, so each call moves on to the
    following page even if the request itself fails.

    Args:
        timeout: Seconds ``requests`` waits on the server before giving
            up. Defaults to 10.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        requests.exceptions.RequestException: If the request fails,
            including ``Timeout`` when the server does not answer within
            ``timeout`` seconds.
    """
    global page
    # The first page (and any non-positive counter) is the site root.
    if page <= 1:
        url = 'http://bimmerpost.com/'
    else:
        url = 'http://bimmerpost.com/page/' + str(page)
    page += 1
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(url, timeout=timeout)
    # Name the parser so the result does not depend on which optional
    # parsers (lxml, html5lib) happen to be installed.
    return BeautifulSoup(response.text, 'html.parser')
# get the stories within this set of data
def parseStories(data):
    """Record every story link found in ``data``.

    Each element matched by the CSS selector ``h2 > a`` bumps the
    module-level ``story_count`` by one and is appended to the
    module-level ``stories`` list as a ``{'title', 'url'}`` dict taken
    from the anchor's text and ``href`` attribute.

    Args:
        data: A parsed document exposing ``select()``.
    """
    # Only story_count is rebound; stories is mutated in place and needs
    # no global declaration.
    global story_count
    for anchor in data.select('h2 > a'):
        story_count = story_count + 1
        stories.append(dict(title=anchor.text, url=anchor['href']))
# Get the stories
def getStories():
    """Collect stories page by page until ``story_qty`` have been found.

    Pages are fetched with ``getPageData()`` and fed to
    ``parseStories()``, which grows the module-level ``stories`` list and
    ``story_count``. Collection stops early when a page contributes no
    stories, so the loop cannot spin forever once the listing runs out.
    Any surplus beyond ``story_qty`` is then trimmed from ``stories``.

    Returns:
        True when ``stories`` holds at most ``story_qty`` entries after
        trimming. This can only be False for a negative ``story_qty``.
    """
    while story_count < story_qty:
        count_before = story_count
        parseStories(getPageData())
        if story_count == count_before:
            # The page had no story links; presumably later pages will not
            # either, so stop instead of requesting pages endlessly.
            break
    # Trim the fat
    del stories[story_qty:]
    return len(stories) <= story_qty
################################################################################
# Run the collection; the flag reports whether it finished normally.
list_of_stories = getStories()
if list_of_stories:
    # Print all of the titles, one "title => url" line per story.
    for story in stories:
        print(' => '.join((story['title'], story['url'])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment