joffilyfe · June 12, 2015 01:37
diff --git a/start.py b/start.py
 # -*- coding: utf-8 -*-
 import mechanize
 import cookielib
 from lxml import html
 from lxml import etree

 # Extra request headers assigned to the mechanize browser's ``addheaders``
 # in get_lattes(). Each entry is a (header-name, value) pair.
 HEADERS = [
     ('Accept-Language', 'en-us,en;q=0.5'),
     ('Accept-Encoding', 'deflate'),
     ('Keep-Alive', '115'),
     ('Connection', 'keep-alive'),
     ('Cache-Control', 'max-age=0'),
     ('Host', 'buscatextual.cnpq.br'),
     # An origin is "<scheme>://<host>"; the separator must be "://".
     ('Origin', 'http://buscatextual.cnpq.br'),
     ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'),
 ]

 def get_lattes(id):
  """Download the HTML of the Lattes CV page for the given identifier.

  ``id`` is concatenated onto ``http://lattes.cnpq.br/`` and must therefore
  be a string. The page is fetched with a mechanize browser that keeps
  cookies, follows redirects and refreshes, and ignores robots.txt.

  Returns the body of the first response when it contains the substring
  ``infpessoa``; otherwise the body obtained after submitting the first
  form on the page with its ``metodo`` field set to ``visualizarCV``.
  No exceptions are caught here.
  """
  cv_url = 'http://lattes.cnpq.br/' + id

  browser = mechanize.Browser()
  browser.set_cookiejar(cookielib.LWPCookieJar())

  # Handler setup; gzip handling is deliberately left at its default.
  browser.set_handle_equiv(True)
  browser.set_handle_redirect(True)
  browser.set_handle_referer(True)
  browser.set_handle_robots(False)
  browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
  browser.addheaders = HEADERS

  first_page = browser.open(cv_url).read()
  if 'infpessoa' not in first_page:
    # NOTE(review): 'infpessoa' presumably marks the rendered CV; without
    # it the page is treated as an intermediate form that must be posted.
    browser.select_form(nr=0)
    # Fields are made writable so that 'metodo' can be overwritten.
    browser.form.set_all_readonly(False)
    browser.form['metodo'] = 'visualizarCV'
    return browser.submit().read()
  return first_page


 def name(text):
  """Return the text nodes found at a fixed absolute XPath of the CV page.

  ``text`` is the page HTML; it is parsed with ``lxml.html.fromstring``.
  The result is whatever list the XPath query yields (empty when nothing
  matches). NOTE(review): presumably this location holds the researcher's
  name -- the path is tied to the page layout, confirm against a live page.
  """
  document = html.fromstring(text)
  return document.xpath(
    "/html/body/div[1]/div[3]/div/div/div/div[3]/div/div[2]/div/text()"
  )

 def addressProfessional(text):
  """Return the text nodes found at a fixed absolute XPath of the CV page.

  ``text`` is the page HTML; it is parsed with ``lxml.html.fromstring``.
  The result is whatever list the XPath query yields (empty when nothing
  matches). NOTE(review): presumably this location holds the professional
  address block -- the path is tied to the page layout, confirm.
  """
  document = html.fromstring(text)
  return document.xpath(
    "/html/body/div[1]/div[3]/div/div/div/div[4]/div/div[2]/div/text()"
  )

 def bibliography(text):
  page = html.fromstring(text)
  #path = "/html/body/div[1]/div[3]/div/div/div/div[17]/div/div[1]/b/text()"
  path = '//div[@class="cita-artigos"]'
  bibliography = page.xpath(path)

  for n in bibliography:
    # print(type(n))
    # print n.getparent().xpath('div[@class="artigo-completo"]/div/div/span/text()')
    print n.getparent().xpath('div[@class="layout-cell layout-cell-11"]/div/text()')


 # Script driver: runs at import time. Fetches one hard-coded CV id; the
 # commented-out calls below are alternative ids kept for manual switching.
 # response = get_lattes("2526900454904947")
 response = get_lattes("5845450440379554")
 # response = get_lattes("7992586996463223")


 # Print the list returned by name() (Python 2 print statement), then let
 # bibliography() print its own matches for the same page.
 print name(response)
 bibliography(response)
	# -*- coding: utf-8 -*-
	import mechanize
	import cookielib
	from lxml import html
	from lxml import etree

	# Extra request headers assigned to the mechanize browser's ``addheaders``
	# in get_lattes(). Each entry is a (header-name, value) pair.
	HEADERS = [
	    ('Accept-Language', 'en-us,en;q=0.5'),
	    ('Accept-Encoding', 'deflate'),
	    ('Keep-Alive', '115'),
	    ('Connection', 'keep-alive'),
	    ('Cache-Control', 'max-age=0'),
	    ('Host', 'buscatextual.cnpq.br'),
	    # An origin is "<scheme>://<host>"; the separator must be "://".
	    ('Origin', 'http://buscatextual.cnpq.br'),
	    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'),
	]

	def get_lattes(id):
	    """Fetch the HTML of a Lattes CV page.

	    ``id`` is appended to ``http://lattes.cnpq.br/`` and so must be a
	    string. A mechanize browser with a cookie jar is configured to honour
	    http-equiv headers, follow redirects and refreshes (max_time=1), send
	    referers, ignore robots.txt, and send the HEADERS list.

	    Returns the first response body if it contains ``infpessoa``;
	    otherwise submits the first form on the page with ``metodo`` set to
	    ``visualizarCV`` and returns that response body. Errors raised while
	    opening or submitting are not caught.
	    """
	    url = 'http://lattes.cnpq.br/'+id

	    br = mechanize.Browser()
	    br.set_cookiejar(cookielib.LWPCookieJar())

	    br.set_handle_equiv(True)
	    # br.set_handle_gzip(True)
	    br.set_handle_redirect(True)
	    br.set_handle_referer(True)
	    br.set_handle_robots(False)
	    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
	    br.addheaders = HEADERS

	    r = br.open(url)
	    response = r.read()
	    # NOTE(review): 'infpessoa' presumably marks the rendered CV -- confirm.
	    if 'infpessoa' in response:
	        return response

	    # Otherwise post the first form, forcing its 'metodo' field; fields
	    # are made writable first so the assignment is accepted.
	    br.select_form(nr=0)
	    br.form.set_all_readonly(False)
	    br.form['metodo'] = 'visualizarCV'
	    r = br.submit()
	    return r.read()


	def name(text):
	    """Return the XPath text-node matches for one fixed location in the page.

	    ``text`` is HTML parsed with ``lxml.html.fromstring``; the return value
	    is the list produced by the absolute XPath below (empty on no match).
	    NOTE(review): presumably the researcher's name lives there -- confirm.
	    """
	    page = html.fromstring(text)
	    path = "/html/body/div[1]/div[3]/div/div/div/div[3]/div/div[2]/div/text()"
	    return page.xpath(path)

	def addressProfessional(text):
	    """Return the XPath text-node matches for one fixed location in the page.

	    ``text`` is HTML parsed with ``lxml.html.fromstring``; the return value
	    is the list produced by the absolute XPath below (empty on no match).
	    NOTE(review): presumably the professional address lives there -- confirm.
	    """
	    page = html.fromstring(text)
	    path = "/html/body/div[1]/div[3]/div/div/div/div[4]/div/div[2]/div/text()"
	    return page.xpath(path)

	def bibliography(text):
	page = html.fromstring(text)
	#path = "/html/body/div[1]/div[3]/div/div/div/div[17]/div/div[1]/b/text()"
	path = '//div[@class="cita-artigos"]'
	bibliography = page.xpath(path)

	for n in bibliography:
	# print(type(n))
	# print n.getparent().xpath('div[@class="artigo-completo"]/div/div/span/text()')
	print n.getparent().xpath('div[@class="layout-cell layout-cell-11"]/div/text()')


	# Script driver: runs at import time. Fetches one hard-coded CV id; the
	# commented-out calls below are alternative ids kept for manual switching.
	# response = get_lattes("2526900454904947")
	response = get_lattes("5845450440379554")
	# response = get_lattes("7992586996463223")


	# Print the list returned by name() (Python 2 print statement), then let
	# bibliography() print its own matches for the same page.
	print name(response)
	bibliography(response)