Created
May 4, 2015 00:52
-
-
Save DiegoQueiroz/e03873b33208b5356849 to your computer and use it in GitHub Desktop.
Override HTTP redirect behavior to CNPq Lattes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import sys, re | |
if sys.version_info.major == 3: | |
# usando Python 3 | |
from urllib.request import Request, build_opener, HTTPRedirectHandler, HTTPCookieProcessor | |
else: | |
# usando Python 2 | |
from urllib2 import Request, build_opener, HTTPRedirectHandler, HTTPCookieProcessor | |
# Requests to Lattes go through several internal HTTP redirects.
# This class changes the redirect behavior whenever the target URL carries
# the "metodo=apresentar" field, in order to get around the captcha.
class LattesHTTPRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that rewrites Lattes redirect targets.

    Any redirect URL whose query string contains ``metodo=apresentar`` is
    rewritten to ``metodo=captchaValido`` before the redirect is followed.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up request for a redirect, rewriting ``newurl``.

        The arguments are the ones the opener passes to
        ``HTTPRedirectHandler.redirect_request``; only ``newurl`` is altered.
        The return value (or raised error) is whatever the base class
        implementation produces for the rewritten URL.
        """
        # The look-behind restricts the substitution to the value of a query
        # parameter named exactly "metodo" (preceded by "?" or "&").
        newurl = re.sub(r'(?<=[?&]metodo=)apresentar', 'captchaValido', newurl)
        # Call the base class explicitly rather than through super(): on
        # Python 2 the urllib2 handlers are old-style classes, and super()
        # raises TypeError for them. The explicit call works on 2 and 3.
        return HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)
if __name__ == "__main__":
    # Fetch one Lattes CV page and dump its HTML to standard output.
    txdata = None  # no request body
    txheaders = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0) Gecko/20100101 Firefox/4.0',
        #
        # *********** As far as I could tell, the lines commented out below
        # *********** are not needed for this script to work.
        # *********** The cookies in particular only confuse the server.
        # *********** I suggest removing them.
        #
        #'Accept-Language': 'en-us,en;q=0.5',
        #'Accept-Encoding': 'deflate',
        #'Keep-Alive': '115',
        #'Connection': 'keep-alive',
        #'Cache-Control': 'max-age=0',
        #'Cookie': 'style=standard; __utma=140185953.294397416.1313390179.1313390179.1317145115.2; __utmz=140185953.1317145115.2.2.utmccn=(referral)|utmcsr=emailinstitucional.cnpq.br|utmcct=/ei/emailInstitucional.do|utmcmd=referral; JSESSIONID=1B98ABF9642E01597AABA0F7A8807FD1.node2',
    }

    url = 'http://lattes.cnpq.br/4727357182510680'
    req = Request(url, txdata, txheaders)

    # For this to work properly the HTTPCookieProcessor must also be enabled;
    # otherwise cookies are lost between redirects and the script becomes
    # unstable. Note that build_opener receives class INSTANCES.
    lattesOpener = build_opener(LattesHTTPRedirectHandler(), HTTPCookieProcessor())

    arquivoH = lattesOpener.open(req)
    try:
        cvLattesHTML = arquivoH.read()
    finally:
        # Release the connection even if reading the body fails.
        arquivoH.close()

    # The body is raw bytes. On Python 3, print() would show its b'...' repr
    # instead of the page, so write the bytes to the binary stdout stream
    # (sys.stdout itself on Python 2) followed by the newline print() added.
    saida = getattr(sys.stdout, 'buffer', sys.stdout)
    saida.write(cvLattesHTML)
    saida.write(b'\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment