Last active
June 13, 2021 01:58
-
-
Save phoemur/4f05bd9900578e48a76d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
import urllib.request | |
import urllib.parse | |
import http.cookiejar | |
import os | |
from lxml.html import fragment_fromstring | |
from collections import OrderedDict | |
def remove_disallowed_filename_chars(filename):
    """
    Build a filesystem-friendly name from ``filename``.

    Characters for which ``str.isalnum()`` is true are kept as they are.
    Every run of one or more other characters (underscores included) is
    collapsed into a single underscore.
    """
    pieces = []
    for char in filename:
        if char.isalnum():
            pieces.append(char)
        elif not pieces or pieces[-1] != "_":
            # Only the first character of a disallowed run emits an underscore.
            pieces.append("_")
    return "".join(pieces)
def get_lista(*args, **kwargs):
    """
    Return every FII (real-estate investment fund) listed on the BVMF page.

    Downloads the fund-listing page, extracts its ``<table>`` markup with a
    regular expression and reads one entry per table row.

    Parameters:
        *args, **kwargs: accepted for backward compatibility; not used.

    Returns:
        An ``OrderedDict`` in page order. Each key is the text of the first
        child of a row's fourth cell; each value is the listing base URL
        joined with the second attribute value of the first child of the
        row's first cell.

    Raises:
        IndexError: if the page contains no ``<table>...</table>`` markup, or
            a row has fewer cells, children or attributes than expected.

    Errors raised while opening or decoding the page are not caught here.
    """
    base_url = 'http://www2.bmfbovespa.com.br/Fundos-Listados/'
    url = base_url + 'FundosListados.aspx?tipoFundo=imobiliario&Idioma=pt-br'

    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

    with opener.open(url) as ur:
        content = ur.read().decode('UTF-8')

    # Greedy match with DOTALL: spans from the first "<table>" to the last
    # "</table>" in the page.
    match = re.search(r'<table>.*</table>', content, re.DOTALL)
    if match is None:
        # IndexError is kept (it is what indexing an empty findall() result
        # raised) so existing callers that catch it keep working.
        raise IndexError('no <table> markup found in {}'.format(url))
    page = fragment_fromstring(match.group(0))

    lista = OrderedDict()
    for row in page.xpath('tr'):
        cells = row.findall('td')
        # Element indexing replaces lxml's deprecated getchildren().
        # NOTE(review): items()[1] is the second attribute of the first
        # cell's first child -- presumably its href; confirm against the
        # page markup.
        lista[cells[3][0].text] = base_url + cells[0][0].items()[1][1]
    return lista
def get_files(fii, link):
    """
    Download every file available for the selected FII.

    Three detail pages of the fund are fetched. On each one, the table that
    follows the ``tbArqListados`` marker is parsed and every listed file that
    is not already present is saved into a directory named ``fii`` (created
    if missing).

    Parameters:
        fii: fund code. Upper-cased for the page URLs and used verbatim as
            the output directory name.
        link: not used by this function; kept so existing callers keep
            working.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        IndexError: if a page has no table after the ``tbArqListados``
            marker, or a row has fewer cells, children or attributes than
            expected.

    Errors raised while opening pages, downloading or writing files are not
    caught here.
    """
    COMUNICADOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=abaPrincipal'.format(fii.upper())
    RELATORIOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaDemonstracoesFinanceiras&idioma=pt-br'.format(fii.upper())
    OUTROS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaOutrosDocumentos&idioma=pt-br'.format(fii.upper())

    os.makedirs(fii, exist_ok=True)

    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

    # Compiled once; the same pattern is applied to all three pages.
    table_pattern = re.compile(r'tbArqListados.*(<table>.*</table>)', re.DOTALL)

    for pagina in (COMUNICADOS, RELATORIOS, OUTROS):
        with opener.open(pagina) as ur:
            content = ur.read().decode('UTF-8')

        match = table_pattern.search(content)
        if match is None:
            # IndexError is kept (it is what indexing an empty findall()
            # result raised) so existing callers that catch it keep working.
            raise IndexError('no table found after tbArqListados in {}'.format(pagina))
        page = fragment_fromstring(match.group(1))

        lista = []
        for row in page.xpath('tr'):
            # Element indexing replaces lxml's deprecated getchildren().
            item = row.findall('td')[0][0]
            # NOTE(review): items()[2] is the third attribute of the element
            # -- presumably the download URL; confirm against the page markup.
            lista.append((remove_disallowed_filename_chars(item.text) + '.PDF',
                          item.items()[2][1]))

        for filename, file_url in lista:
            # Paths are built explicitly instead of chdir-ing into the fund
            # directory, so a failed download never leaves the process in a
            # different working directory.
            destination = os.path.join(fii, filename)
            if os.path.exists(destination):
                print('{} já está salvo no diretório'.format(filename))
                continue

            print('Downloading {}'.format(filename))
            # The whole file is read before the destination is opened, so a
            # failed download does not leave an empty file behind.
            with urllib.request.urlopen(file_url.replace('http://www.bmfbovespa', 'http://www2.bmfbovespa')) as ur:
                data = ur.read()
            with open(destination, mode='wb') as fh:
                fh.write(data)
if __name__ == '__main__':
    # Command-line entry point: each argument names one or more funds whose
    # files should be downloaded.
    import sys

    # Without fund arguments, or with a help flag first, print the usage
    # line and exit with status 1.
    if len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}:
        print('Modo de uso: {0} "FII[1]" "FII[2]" ... "FII[N]"'.format(sys.argv[0]))
        sys.exit(1)

    lista = get_lista()
    for arg in sys.argv[1:]:
        wanted = arg.upper()
        for fii, link in lista.items():
            # A fund is selected when its code occurs inside the upper-cased
            # argument. A plain substring test is used so that scraped text
            # is never interpreted as a regular expression; empty or missing
            # codes are skipped.
            if fii and fii in wanted:
                get_files(fii, link)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Este script faz o download de todos os arquivos disponíveis sobre determinado Fundo de Investimento Imobiliário listado na BVMF.
Modo de uso: python fii.py FII[1] FII[2] ... FII[N]
Se o arquivo já estiver salvo, ele não faz o download; só baixa os arquivos novos. Necessita da dependência lxml instalada:
pip install lxml