Skip to content

Instantly share code, notes, and snippets.

@automata
Created April 20, 2012 23:27
Show Gist options
  • Save automata/2432637 to your computer and use it in GitHub Desktop.
Save automata/2432637 to your computer and use it in GitHub Desktop.
simple scraper
# -*- coding: utf-8 -*-
import urllib
import urllib2
import string
import sys
from bs4 import BeautifulSoup
# Fetch the English Wikipedia "Music" article and print the text of every
# <p> element it contains. This is a Python 2 script (it relies on urllib2).

# faking a browser: send a browser-like User-Agent header with the request
user_agent = "Mozilla/5.0 (Linux x86_64) Gecko/20120324 Firefox/14.0a1"
headers = {'User-Agent': user_agent}

# making the HTTP request; data=None means no request body, i.e. an HTTP GET
data = None
request = urllib2.Request("http://en.wikipedia.org/wiki/Music", data, headers)
response = urllib2.urlopen(request)
try:
    # reading the whole response body from the HTTP GET
    htmlpage = response.read()
finally:
    # release the connection even if reading fails
    response.close()

# parsing with beautifulsoup; the parser is named explicitly so the resulting
# tree does not depend on which optional parsers happen to be installed
soup = BeautifulSoup(htmlpage, "html.parser")

# we want just the <p> elements
ps = soup.find_all("p")

# printing just the text inside <p> elements already found; the parenthesized
# single-argument form behaves the same as the Python 2 print statement
for p in ps:
    print(p.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment