Created
May 25, 2012 02:02
-
-
Save huogerac/2785367 to your computer and use it in GitHub Desktop.
Dicas de aula - 24mai
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
def mostrar_args(f):
    """Decorator that logs the arguments of every call to ``f`` on stderr.

    Args:
        f: the callable to wrap.

    Returns:
        A wrapper that writes the positional and keyword arguments to
        ``sys.stderr`` and then returns ``f(*args, **kwargs)`` unchanged.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import functools

    @functools.wraps(f)  # preserve f's __name__ and __doc__ on the wrapper
    def _f(*args, **kwargs):
        sys.stderr.write('###argumentos: {0}, {1}\n'.format(args, kwargs))
        return f(*args, **kwargs)
    return _f
@mostrar_args
def somar(a, b):
    """Return the sum of ``a`` and ``b``."""
    resultado = a + b
    return resultado
# Demo call: the decorator logs the arguments on stderr, the sum goes to stdout.
print 'resultado de somar(2,3):', somar(2, 3)
**generator function com send
VER o uso de yield com send | |
yield usa o conceito de corrotina em vez de subrotina | |
yield = (dar a preferência) | |
Dicas: | |
utilizar a lib httplib2 (google) para ler sites |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
#biblioteca padrao | |
from pprint import pprint | |
import re | |
import pickle | |
#biblioteca terceiros | |
import httplib2 # dic do Mark Pilgrimm: | |
from tornado import httpclient, ioloop, gen | |
# Catalogue page that links to every book, and the base URL under which the
# individual book pages and cover images are fetched.
URL_CATALOGO = 'http://www.novatec.com.br/catalogo.php'
URL_PRODUTOS = 'http://www.novatec.com.br/livros/'

# Captures the book slug from links of the form href="livros/<slug>...".
# re.DOTALL only affects how '.' matches; the flag is kept from an earlier
# '(.*?)' variant of this pattern.
RE_SLUG_LIVRO = re.compile(r'href="livros/([^"/]+)', re.DOTALL)

# Captures the cover-image path from the "enlarge cover" link, e.g.
#   javascript:AmpliarCapa('livros/oracle10g/capa_ampliada8575220632.jpg');
RE_CAPA = re.compile(r"AmpliarCapa\('livros/([^']+)")
def buscar_slugs(url):
    """Return the sorted, de-duplicated book slugs found at ``url``.

    The result is cached in ``slugs.pickle`` in the current directory; when
    that file can be loaded, no HTTP request is made.

    Args:
        url: address of the catalogue page to scan for book links.

    Returns:
        A sorted list of unique slugs.
    """
    try:
        # Binary mode: the cache is written below with 'wb' and a binary
        # pickle protocol, so it has to be read back the same way.
        # NOTE: pickle is only safe on trusted files; this cache is produced
        # by this very function.
        with open('slugs.pickle', 'rb') as slugs_pickle:
            slugs = pickle.load(slugs_pickle)
    except (IOError, EOFError, pickle.UnpicklingError):
        # Missing, empty or corrupt cache: fall back to downloading.
        slugs = None
    if slugs is None:
        h = httplib2.Http('.cache')
        _resp, content = h.request(url)
        slugs = sorted(set(RE_SLUG_LIVRO.findall(content)))
        with open('slugs.pickle', 'wb') as slugs_pickle:
            pickle.dump(slugs, slugs_pickle, -1)
    return slugs
def buscar_produtos(slugs):
    """Schedule the asynchronous download of every book in ``slugs``.

    A set with all slugs is shared with each ``baixar`` call so the
    downloads can track which ones are still outstanding.
    """
    restantes = set(slugs)
    cliente = httpclient.AsyncHTTPClient()
    for nome in slugs:
        baixar(nome, cliente, restantes)
@gen.engine | |
def baixar(slug, http_client, pendentes): | |
resp = yield gen.Task(http_client.fetch, URL_PRODUTOS+slug) | |
if resp.error: | |
print '*** Erro ao baixar ', slug | |
print '\t', resp.error | |
else: | |
print 'baixado: ', slug | |
#print '\t', resp | |
path_capa = RE_CAPA.search(resp.body) | |
path_capa = path_capa.group(1) | |
resp_capa = yield gen.Task(http_client.fetch, URL_PRODUTOS+path_capa) | |
##pendentes.add(path_capa) | |
if resp_capa.error: | |
print '*** Erro ao baixar capa', path_capa | |
print '\t', resp_capa.error | |
else: | |
nome_img = path_capa.replace('/','_') | |
with open(nome_img, 'wb') as img: | |
img.write(resp_capa.body) | |
print 'salvo: ', nome_img | |
pendentes.remove(slug) | |
if not pendentes: | |
ioloop.IOLoop.instance().stop() | |
if __name__ == '__main__':
    slugs = buscar_slugs(URL_CATALOGO)
    # Only the first five slugs, to keep test runs quick.
    slugs = slugs[:5]
    # With no slugs nothing would ever stop the IOLoop, so only start it
    # when there is at least one download scheduled.
    if slugs:
        buscar_produtos(slugs)
        ioloop.IOLoop.instance().start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment