Created
November 20, 2012 00:26
-
-
Save Garciat/4115134 to your computer and use it in GitHub Desktop.
Scraper para cinecenter.com.bo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from urllib2 import Request, urlopen | |
from bs4 import BeautifulSoup as Soup | |
class Scraper(object):
    """Scrape the movie listing ("cartelera") of cinecenter.com.bo for a region.

    The site is accessed with a PHP session cookie: the region page is
    requested once to obtain a ``PHPSESSID`` value, which is then attached to
    the listing request.

    Attributes:
        region: Region key the scraper was created with (a key of ``regions``).
        session: ``PHPSESSID`` value, or ``None`` until a session is fetched.
    """

    # Region key (used to build the region page URL) -> region name.
    regions = dict(
        lapaz='La Paz',
        cochabamba='Cochabamba',
        santacruz='Santa Cruz',
    )

    base_url = 'http://cinecenter.com.bo'

    # Locates the PHPSESSID cookie anywhere in a Set-Cookie header value.
    # The cookie name must sit at the start of the value or right after a
    # ';' / ',' separator so that it is never matched inside another value.
    _SESSION_COOKIE_RE = re.compile(
        r'(?:^|[;,])\s*PHPSESSID\s*=\s*([^;,\s]+)', re.IGNORECASE)

    # Poster images of movies carry a title starting with "PELICULA".
    _TITLE_RE = re.compile('^PELICULA')

    def __init__(self, region):
        """Create a scraper for ``region``.

        Args:
            region: One of the keys of ``Scraper.regions``.

        Raises:
            ValueError: If ``region`` is not a known region key.
        """
        if region not in self.regions:
            raise ValueError('region does not exist')
        self.region = region
        self.session = None

    @staticmethod
    def _extract_session_id(header):
        """Return the PHPSESSID value found in a Set-Cookie header, or None.

        The lookup is case-insensitive on the cookie name, tolerates '=' inside
        the value, and scans the whole header because urllib2 joins repeated
        Set-Cookie headers into a single ', '-separated string.
        """
        match = Scraper._SESSION_COOKIE_RE.search(header)
        if match is None:
            return None
        return match.group(1)

    def init_session(self):
        """Request the region page and store its PHPSESSID in ``self.session``.

        Raises:
            RuntimeError: If the response has no Set-Cookie header, or the
                header does not contain a PHPSESSID cookie.
        """
        response = urlopen('%s/%s.html' % (self.base_url, self.region))
        try:
            headers = dict(response.info())
        finally:
            # Only the headers are needed; release the connection right away.
            response.close()

        # Explicit raises instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let a bad response go unnoticed.
        if 'set-cookie' not in headers:
            raise RuntimeError('no cookie header')
        session = self._extract_session_id(headers['set-cookie'])
        if session is None:
            raise RuntimeError('session cookie could not be fetched')
        self.session = session

    def session_request(self, *args, **kwargs):
        """Build a ``Request`` that carries the session cookie.

        All arguments are forwarded to ``Request``. A session is fetched first
        if none is stored yet. A ``Cookie`` header already present on the
        request is kept, and the session cookie is appended to it.

        Returns:
            The ``Request`` object, not yet opened.
        """
        if self.session is None:
            self.init_session()
        request = Request(*args, **kwargs)
        cookie = 'PHPSESSID=' + self.session
        if 'Cookie' in request.headers:
            cookie = '%s; %s' % (request.headers['Cookie'], cookie)
        request.add_header('Cookie', cookie)
        return request

    def scrape(self):
        """Fetch the listing page and print each movie with its showtimes.

        For every movie cell the title is printed (UTF-8 encoded), followed by
        a ``Normal`` line and a ``3D`` line. Cells that lack the poster image
        or the hint ``div`` are skipped.

        Raises:
            IndexError: If a movie's hint text has fewer than four lines.
        """
        request = self.session_request(
            self.base_url + '/index.php?accion=getCartelera')
        response = urlopen(request)
        try:
            dom = Soup(response.read())
        finally:
            response.close()

        for movie in dom('td', class_='over_modul'):
            img = movie.find('img', title=self._TITLE_RE)
            hint = movie.find('div', class_='div_hint')
            if img is None or hint is None:
                # Not a movie cell (or unexpected markup): skip it instead of
                # failing with an opaque error on None.
                continue

            title = img['title'].split('PELICULA:', 1)[-1].strip()
            horarios = hint.get_text().strip().split('\n')
            # Lines 1 and 3 of the hint are the ones reported as the "Normal"
            # and "3D" showtimes; presumably the even lines are their labels.
            normal = horarios[1]
            tresd = horarios[3]

            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(title.encode('utf-8'))
            print('Normal %s' % normal)
            print('3D %s' % tresd)
if __name__ == '__main__':
    # Script entry point: scrape and print the listing for the
    # 'santacruz' region.
    scraper = Scraper('santacruz')
    scraper.scrape()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment