Created
April 1, 2015 15:24
-
-
Save iKlotho/440abbd61c842f9ae8dd to your computer and use it in GitHub Desktop.
Fetch data from the web and store it in an SQLite database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from urllib2 import urlopen | |
import sqlite3 | |
# Site root; the hrefs scraped from the index page are appended to it.
BASE_URL = "http://www.chicagoreader.com"

# Open (or create) the SQLite database file in the current working directory.
conn = sqlite3.connect('webscrap.db')
c = conn.cursor()

# IF NOT EXISTS lets the script be re-run against an existing database file;
# a plain CREATE TABLE raises sqlite3.OperationalError on the second run.
c.execute('''CREATE TABLE IF NOT EXISTS stocks
             (category_url text, category text, winner text, runners_up text)''')
def make_soup(url):
    """Download ``url`` and return its content parsed by BeautifulSoup.

    The page is fetched with ``urlopen`` and handed to BeautifulSoup with
    the "lxml" parser.  Errors raised while opening or reading the URL
    propagate to the caller.
    """
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection right away instead of waiting for the
        # response object to be garbage-collected.
        response.close()
    return BeautifulSoup(html, "lxml")
def get_category_links(section_url):
    """Return the absolute URL of every category linked from a section page.

    Finds the ``<dl class="boccat">`` element on the page at ``section_url``
    and, for each ``<dd>`` inside it, prefixes the ``href`` of that entry's
    first ``<a>`` with ``BASE_URL``.
    """
    listing = make_soup(section_url).find("dl", "boccat")
    links = []
    for entry in listing.findAll("dd"):
        links.append(BASE_URL + entry.a["href"])
    return links
def get_category_winner(category_url):
    """Scrape one category page and insert its result into ``stocks``.

    Reads the category name from the ``<h1 class="headline">`` element, the
    winner from the first ``<h2 class="boc1">`` and the runner-up from the
    first ``<h2 class="boc2">``, then inserts one row through the
    module-level cursor ``c``.  Nothing is committed here; the caller must
    call ``conn.commit()``.

    A headline, winner or runner-up that is missing from the page is stored
    as NULL so that one incomplete page does not abort the whole scrape.
    """
    def tag_string(tag):
        # ``find`` returns None when nothing matches; map that to NULL.
        return tag.string if tag is not None else None

    soup = make_soup(category_url)
    row = (
        category_url,
        tag_string(soup.find("h1", "headline")),
        tag_string(soup.find("h2", "boc1")),
        tag_string(soup.find("h2", "boc2")),
    )
    c.execute('INSERT INTO stocks VALUES (?,?,?,?)', row)
if __name__ == '__main__':
    food_n_drink = ("http://www.chicagoreader.com/chicago/"
                    "best-of-chicago-2011-food-drink/BestOf?oid=4106228")
    # Upper bound on the number of category pages fetched in one run.
    max_categories = 50
    try:
        categories = get_category_links(food_n_drink)
        for category in categories[:max_categories]:
            get_category_winner(category)
        # Commit after the loop so the rows are saved no matter how many
        # categories the index lists; committing only upon reaching a 51st
        # entry would drop every insert when there are 50 or fewer.
        conn.commit()
    finally:
        # Always release the database handle, even if a page fails to load.
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment