phoemur · June 13, 2021 01:58 · felipemaion · Mar 6, 2021
diff --git a/fii.py b/fii.py
 #!/usr/bin/env python3

 import re
 import urllib.request
 import urllib.parse
 import http.cookiejar
 import os

 from lxml.html import fragment_fromstring
 from collections import OrderedDict

 def remove_disallowed_filename_chars(filename):
    '''
    Build a filesystem-friendly name from ``filename``.

    Every character for which ``str.isalnum()`` is false becomes an
    underscore, and a run of consecutive underscores is reduced to a single
    one. Alphanumeric characters are kept unchanged.
    '''
    sanitized = []
    for char in filename:
        if char.isalnum():
            sanitized.append(char)
        elif not sanitized or sanitized[-1] != "_":
            # Emit at most one underscore per run of disallowed characters.
            sanitized.append("_")
    return "".join(sanitized)

 def get_lista(*args, **kwargs):
    '''
    Return all the FII listed on BVMF.

    The listing page is downloaded with a cookie-aware opener, the text from
    the first ``<table>`` to the last ``</table>`` is parsed as an HTML
    fragment, and one entry is produced per ``tr`` row. Positional and
    keyword arguments are accepted but ignored.

    Returns an ``OrderedDict`` in page order. Each key is the text of the
    first child of the row's fourth cell; each value is the site prefix
    joined with the second attribute value of the first child of the row's
    first cell (presumably the anchor's href -- confirm against the page).
    '''
    url = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListados.aspx?tipoFundo=imobiliario&Idioma=pt-br'

    # Browser-like headers on an opener that keeps cookies between requests.
    jar = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(jar)
    browser = urllib.request.build_opener(cookie_handler)
    browser.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
        ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01'),
    ]

    with browser.open(url) as response:
        html = response.read().decode('UTF-8')

    # DOTALL lets '.' cross line breaks, so the match spans the whole table.
    table_regex = re.compile('<table>.*</table>', re.DOTALL)
    table_html = table_regex.findall(html)[0]
    table = fragment_fromstring(table_html)

    funds = OrderedDict()
    for row in table.xpath('tr'):
        cells = row.findall('td')
        fund_name = cells[3].getchildren()[0].text
        detail_path = cells[0].getchildren()[0].items()[1][1]
        funds[fund_name] = 'http://www2.bmfbovespa.com.br/Fundos-Listados/' + detail_path

    return funds

 def get_files(fii, link):
    '''
    Download every file available for the selected FII.

    A directory named ``fii`` is created if needed under the current working
    directory. Each document listed on the fund's three detail tabs
    (COMUNICADOS, RELATORIOS, OUTROS) is saved into it as
    ``<sanitized title>.PDF``. Files already present in that directory are
    not fetched again.

    The working directory is switched to the fund directory while
    downloading and is always restored to where it was on entry, even when a
    request or a parse step raises.

    Parameters:
        fii: fund code; upper-cased for the ``Sigla`` query parameter and
            used as given for the directory name.
        link: accepted for interface compatibility; not used.

    Raises:
        IndexError: when a tab page has no match for the document table.
        Any error raised by the HTTP requests or file writes propagates.
    '''
    COMUNICADOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=abaPrincipal'.format(fii.upper())
    RELATORIOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaDemonstracoesFinanceiras&idioma=pt-br'.format(fii.upper())
    OUTROS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaOutrosDocumentos&idioma=pt-br'.format(fii.upper())

    # Compiled once: the pattern is the same for every tab.
    pattern = re.compile('tbArqListados.*(<table>.*</table>)', re.DOTALL)

    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

    # Remember the starting directory so it can be restored exactly,
    # whatever ``fii`` looks like and whatever goes wrong below.
    previous_dir = os.getcwd()
    os.makedirs(fii, exist_ok=True)
    os.chdir(fii)
    try:
        # One directory scan up front; names are added as files are written.
        saved = set(os.listdir('.'))

        for pagina in (COMUNICADOS, RELATORIOS, OUTROS):
            with opener.open(pagina) as ur:
                content = ur.read().decode('UTF-8')

            page = fragment_fromstring(pattern.findall(content)[0])

            documentos = []
            for row in page.xpath('tr'):
                # First child of the first cell carries both the title text
                # and, as its third attribute, the document address.
                anchor = row.findall('td')[0].getchildren()[0]
                documentos.append((remove_disallowed_filename_chars(anchor.text) + '.PDF',
                                   anchor.items()[2][1]))

            for filename, href in documentos:
                if filename in saved:
                    print('{} já está salvo no diretório'.format(filename))
                    continue

                print('Downloading {}'.format(filename))
                with urllib.request.urlopen(href.replace('http://www.bmfbovespa', 'http://www2.bmfbovespa')) as ur:
                    data = ur.read()
                # Written only after the full body was read, so a failed
                # download leaves no partial file behind.
                with open(filename, mode='wb') as fh:
                    fh.write(data)
                saved.add(filename)
    finally:
        os.chdir(previous_dir)

 if __name__ == '__main__':
    import sys

    # With no arguments, or when help is asked for, show the usage line and
    # stop with exit status 1.
    wants_usage = len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}
    if wants_usage:
        print('Modo de uso: {0} "FII[1]" "FII[2]" ... "FII[N]"'.format(sys.argv[0]))
        sys.exit(1)

    fundos = get_lista()

    # Each listed fund name is used as a regular expression and searched for
    # inside the upper-cased argument; every hit triggers a download.
    for requested in sys.argv[1:]:
        for codigo, url in fundos.items():
            if re.search(codigo, requested.upper()) is None:
                continue
            get_files(codigo, url)
	#!/usr/bin/env python3

	import re
	import urllib.request
	import urllib.parse
	import http.cookiejar
	import os

	from lxml.html import fragment_fromstring
	from collections import OrderedDict

	def remove_disallowed_filename_chars(filename):
	    '''
	    Build a filesystem-friendly name from ``filename``.

	    Every character for which ``str.isalnum()`` is false becomes an
	    underscore, and a run of consecutive underscores is reduced to a single
	    one. Alphanumeric characters are kept unchanged.
	    '''
	    sanitized = []
	    for char in filename:
	        if char.isalnum():
	            sanitized.append(char)
	        elif not sanitized or sanitized[-1] != "_":
	            # Emit at most one underscore per run of disallowed characters.
	            sanitized.append("_")
	    return "".join(sanitized)

	def get_lista(*args, **kwargs):
	    '''
	    Return all the FII listed on BVMF.

	    The listing page is downloaded with a cookie-aware opener, the text from
	    the first ``<table>`` to the last ``</table>`` is parsed as an HTML
	    fragment, and one entry is produced per ``tr`` row. Positional and
	    keyword arguments are accepted but ignored.

	    Returns an ``OrderedDict`` in page order. Each key is the text of the
	    first child of the row's fourth cell; each value is the site prefix
	    joined with the second attribute value of the first child of the row's
	    first cell (presumably the anchor's href -- confirm against the page).
	    '''
	    url = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListados.aspx?tipoFundo=imobiliario&Idioma=pt-br'

	    # Browser-like headers on an opener that keeps cookies between requests.
	    cj = http.cookiejar.CookieJar()
	    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
	    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
	                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

	    with opener.open(url) as ur:
	        content = ur.read().decode('UTF-8')

	    # DOTALL lets '.' cross line breaks, so the match spans the whole table.
	    pattern = re.compile('<table>.*</table>', re.DOTALL)
	    page = fragment_fromstring(pattern.findall(content)[0])

	    lista = OrderedDict()
	    for row in page.xpath('tr'):
	        cells = row.findall('td')
	        fund_name = cells[3].getchildren()[0].text
	        detail_path = cells[0].getchildren()[0].items()[1][1]
	        lista[fund_name] = 'http://www2.bmfbovespa.com.br/Fundos-Listados/' + detail_path

	    return lista

	def get_files(fii, link):
	    '''
	    Download every file available for the selected FII.

	    A directory named ``fii`` is created if needed under the current working
	    directory. Each document listed on the fund's three detail tabs
	    (COMUNICADOS, RELATORIOS, OUTROS) is saved into it as
	    ``<sanitized title>.PDF``. Files already present in that directory are
	    not fetched again.

	    The working directory is switched to the fund directory while
	    downloading and is always restored to where it was on entry, even when a
	    request or a parse step raises.

	    Parameters:
	        fii: fund code; upper-cased for the ``Sigla`` query parameter and
	            used as given for the directory name.
	        link: accepted for interface compatibility; not used.

	    Raises:
	        IndexError: when a tab page has no match for the document table.
	        Any error raised by the HTTP requests or file writes propagates.
	    '''
	    COMUNICADOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=abaPrincipal'.format(fii.upper())
	    RELATORIOS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaDemonstracoesFinanceiras&idioma=pt-br'.format(fii.upper())
	    OUTROS = 'http://www2.bmfbovespa.com.br/Fundos-Listados/FundosListadosDetalhe.aspx?Sigla={}&tipoFundo=Imobiliario&aba=subAbaOutrosDocumentos&idioma=pt-br'.format(fii.upper())

	    # Compiled once: the pattern is the same for every tab.
	    pattern = re.compile('tbArqListados.*(<table>.*</table>)', re.DOTALL)

	    cj = http.cookiejar.CookieJar()
	    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
	    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
	                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

	    # Remember the starting directory so it can be restored exactly,
	    # whatever ``fii`` looks like and whatever goes wrong below.
	    previous_dir = os.getcwd()
	    os.makedirs(fii, exist_ok=True)
	    os.chdir(fii)
	    try:
	        # One directory scan up front; names are added as files are written.
	        saved = set(os.listdir('.'))

	        for pagina in (COMUNICADOS, RELATORIOS, OUTROS):
	            with opener.open(pagina) as ur:
	                content = ur.read().decode('UTF-8')

	            page = fragment_fromstring(pattern.findall(content)[0])

	            documentos = []
	            for row in page.xpath('tr'):
	                # First child of the first cell carries both the title text
	                # and, as its third attribute, the document address.
	                anchor = row.findall('td')[0].getchildren()[0]
	                documentos.append((remove_disallowed_filename_chars(anchor.text) + '.PDF',
	                                   anchor.items()[2][1]))

	            for filename, href in documentos:
	                if filename in saved:
	                    print('{} já está salvo no diretório'.format(filename))
	                    continue

	                print('Downloading {}'.format(filename))
	                with urllib.request.urlopen(href.replace('http://www.bmfbovespa', 'http://www2.bmfbovespa')) as ur:
	                    data = ur.read()
	                # Written only after the full body was read, so a failed
	                # download leaves no partial file behind.
	                with open(filename, mode='wb') as fh:
	                    fh.write(data)
	                saved.add(filename)
	    finally:
	        os.chdir(previous_dir)

	if __name__ == '__main__':
	    import sys

	    # With no arguments, or when help is asked for, show the usage line and
	    # stop with exit status 1.
	    if len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}:
	        print('Modo de uso: {0} "FII[1]" "FII[2]" ... "FII[N]"'.format(sys.argv[0]))
	        sys.exit(1)

	    lista = get_lista()

	    # Each listed fund name is used as a regular expression and searched for
	    # inside the upper-cased argument; every hit triggers a download.
	    for arg in sys.argv[1:]:
	        for fii, link in lista.items():
	            if re.search(fii, arg.upper()):
	                get_files(fii, link)