Skip to content

Instantly share code, notes, and snippets.

@joffilyfe
Created June 12, 2015 01:37
Show Gist options
  • Save joffilyfe/f2d7c420eee64279992e to your computer and use it in GitHub Desktop.
Save joffilyfe/f2d7c420eee64279992e to your computer and use it in GitHub Desktop.
Vislumbre de um crawler para o lattes.
# -*- coding: utf-8 -*-
import mechanize
import cookielib
from lxml import html
from lxml import etree
# Extra request headers, as (name, value) pairs, that get_lattes installs on
# the mechanize browser via ``addheaders``.
HEADERS = [
    ('Accept-Language', 'en-us,en;q=0.5'),
    ('Accept-Encoding', 'deflate'),
    ('Keep-Alive', '115'),
    ('Connection', 'keep-alive'),
    ('Cache-Control', 'max-age=0'),
    ('Host', 'buscatextual.cnpq.br'),
    # The scheme separator used to be typed as "http,//", which is not a
    # valid origin; it must be "http://".
    ('Origin', 'http://buscatextual.cnpq.br'),
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'),
]
def get_lattes(id):
    """Download the page served for a Lattes CV id and return its body.

    The id is appended to ``http://lattes.cnpq.br/`` and fetched with a
    mechanize browser carrying the module-level HEADERS. If the body already
    contains the marker ``infpessoa`` it is returned as is. Otherwise the
    first form on the page is submitted with its ``metodo`` field set to
    ``visualizarCV`` and the body of that second response is returned.

    Args:
        id: CV identifier, concatenated to the base URL (so it must be a
            string).

    Returns:
        The body read from whichever response ended up being used.
    """
    target = 'http://lattes.cnpq.br/' + id

    browser = mechanize.Browser()
    browser.set_cookiejar(cookielib.LWPCookieJar())
    browser.set_handle_equiv(True)
    # gzip handling is deliberately left at its default; the original kept
    # the set_handle_gzip(True) call disabled.
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    # robots.txt is not consulted.
    browser.set_handle_robots(False)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    browser.addheaders = HEADERS

    body = browser.open(target).read()
    if 'infpessoa' not in body:
        # Marker missing: presumably an intermediate page whose first form
        # leads to the CV (TODO confirm against the live site).
        browser.select_form(nr=0)
        # Read-only controls must be unlocked before 'metodo' can be set.
        browser.form.set_all_readonly(False)
        browser.form['metodo'] = 'visualizarCV'
        body = browser.submit().read()
    return body
def name(text):
    """Return the text nodes at the fixed XPath used for the name field.

    Args:
        text: HTML source of a CV page, parsed with ``lxml.html.fromstring``.

    Returns:
        Whatever ``xpath`` yields for the absolute path below; the path ends
        in ``text()``, so these are text nodes (empty result if the layout
        does not match).
    """
    return html.fromstring(text).xpath(
        "/html/body/div[1]/div[3]/div/div/div/div[3]/div/div[2]/div/text()"
    )
def addressProfessional(text):
    """Return the text nodes at the fixed XPath used for the address field.

    Args:
        text: HTML source of a CV page, parsed with ``lxml.html.fromstring``.

    Returns:
        Whatever ``xpath`` yields for the absolute path below; the path ends
        in ``text()``, so these are text nodes (empty result if the layout
        does not match).
    """
    return html.fromstring(text).xpath(
        "/html/body/div[1]/div[3]/div/div/div/div[4]/div/div[2]/div/text()"
    )
def bibliography(text):
    """Print the text found beside every ``cita-artigos`` div of a page.

    For each ``div`` whose class is exactly ``cita-artigos``, the query
    ``div[@class="layout-cell layout-cell-11"]/div/text()`` is evaluated
    relative to that div's parent and the resulting list is printed, one
    list per match. Nothing is returned.

    Args:
        text: HTML source of a CV page, parsed with ``lxml.html.fromstring``.
    """
    document = html.fromstring(text)
    entry_text = 'div[@class="layout-cell layout-cell-11"]/div/text()'
    for citation in document.xpath('//div[@class="cita-artigos"]'):
        # The query starts from the parent, so it looks at the citation's
        # sibling cells rather than inside the citation div itself.
        print(citation.getparent().xpath(entry_text))
# Demo run executed at module level: fetch one CV, print what name() finds,
# then let bibliography() print its matches.
# Other ids tried earlier: "2526900454904947" and "7992586996463223".
response = get_lattes("5845450440379554")
print(name(response))
bibliography(response)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment