Skip to content

Instantly share code, notes, and snippets.

@Luxter77
Last active December 4, 2020 18:36
Show Gist options
  • Save Luxter77/c1383e465ca47e0e112a4d817e874e20 to your computer and use it in GitHub Desktop.
Save Luxter77/c1383e465ca47e0e112a4d817e874e20 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import requests
import os
import re
from typing import List
from bs4 import BeautifulSoup
# Landing page of the Paraguayan Contraloria site; used as the default
# ``page`` argument of contraloria_get_url().
contraloria_py = 'https://djbpublico.contraloria.gov.py/'
# Type alias: a list of URL strings.
URList = List[str]
def contraloria_get_url(page: str = contraloria_py) -> URList:
    '''
    Collect the URLs of the PDF download pages listed on the Contraloria site.

    The function GETs ``page``, reads the ``name`` of the first hidden
    ``<input>`` whose value is ``'1'`` (presumably a per-session form token --
    TODO confirm), POSTs that token back together with ``limit=0`` and then
    gathers the ``href`` of every ``<a class="btn btn-success">`` element in
    the response.

    Args:
        page: URL of the listing page. Every collected link is built as
            ``page`` plus the anchor's ``href`` with its first character
            dropped.

    Returns:
        The list of URLs found (possibly empty).

    Raises:
        requests.HTTPError: if either request comes back with an error status.
        IndexError: if ``page`` contains no hidden input with value ``'1'``.
    '''
    # The context manager closes the session's pooled connections even if a
    # request or the parsing fails.
    with requests.Session() as s:
        landing = s.get(page)
        landing.raise_for_status()
        page_soup = BeautifulSoup(landing.content, 'html.parser')
        input_list = page_soup.findAll("input", {'type': 'hidden', 'value': '1'})
        if not input_list:
            raise IndexError(f'no hidden token input found on {page}')
        magic_number = input_list[0].get('name')

        # NOTE(review): 'limit': '0' presumably asks for the unpaginated
        # listing -- confirm against the site's form.
        listing = s.post(page, data={magic_number: '1', 'limit': '0'})
        listing.raise_for_status()
        listing_soup = BeautifulSoup(listing.content, 'html.parser')

    hrefs = (
        btn.get('href')
        for btn in listing_soup.findAll('a', {'class': 'btn btn-success'})
    )
    # Anchors without an href carry no link; skip them instead of crashing.
    # The first character of each href is dropped before appending to `page`.
    urlist = [page + href[1:] for href in hrefs if href is not None]
    print(f'Obtenidos: {len(urlist)} URLs')
    return urlist
def _contraloria_pdf_name(disposition: str, filename_re=re.compile(r'filename="?([^";]+)')) -> str:
    'Return a directory-free file name taken from a Content-Disposition value.'
    match = filename_re.search(disposition)
    if match is None:
        raise IndexError(f'no filename in Content-Disposition: {disposition!r}')
    # The name is supplied by the server: keep only its last path component
    # (treating backslashes as separators too) so it cannot escape targetDir.
    fname = os.path.basename(match.group(1).strip().replace('\\', '/'))
    if fname in ('', '.', '..'):
        raise ValueError(f'unusable file name in Content-Disposition: {disposition!r}')
    return fname


def contraloria_download_pdfs(urlist: URList, targetDir: str) -> None:
    '''
    Download the file behind every page URL in ``urlist`` into ``targetDir``.

    For each URL the function GETs the page, reads the ``name`` of the second
    hidden ``<input>`` whose value is ``'1'`` (presumably a form token -- TODO
    confirm), POSTs the download form and writes the response body to
    ``targetDir`` under the file name announced in the ``Content-Disposition``
    response header. Progress is printed every 50 items.

    Args:
        urlist: Page URLs, as returned by contraloria_get_url().
        targetDir: Existing directory that receives the downloaded files.

    Raises:
        requests.HTTPError: if a request comes back with an error status.
        IndexError: if a page lacks the second hidden input, or the
            ``Content-Disposition`` header carries no file name.
        KeyError: if the download response has no ``Content-Disposition``
            header.
        ValueError: if the announced file name is empty or a directory
            reference.
    '''
    URList_len = len(urlist)
    print(f'Bajando {URList_len} pdfs:')
    # The context manager closes the session's pooled connections even when a
    # download fails midway.
    with requests.Session() as s:
        for i, paged in enumerate(urlist):
            if i % 50 == 0:
                print(f'\tBajados {i} de {URList_len}')

            detail = s.get(paged)
            detail.raise_for_status()
            page_soup = BeautifulSoup(detail.content, 'html.parser')
            hidden_inputs = page_soup.findAll("input", {'type': 'hidden',
                                                        'value': '1'})
            if len(hidden_inputs) < 2:
                raise IndexError(f'download token input not found on {paged}')
            magic_number = hidden_inputs[1].get('name')

            r = s.post(paged, data={
                'submit': 'Descarga',
                'license_agree': '1',
                # Text before the first '-' in the URL's last path segment.
                'download': paged.split('/')[-1].split('-')[0],
                magic_number: '1'})
            r.raise_for_status()
            if 'content-disposition' not in r.headers:
                raise KeyError(f'no Content-Disposition header in response for {paged}')

            fname = _contraloria_pdf_name(r.headers['content-disposition'])
            with open(os.path.join(targetDir, fname), 'wb') as targetFile:
                targetFile.write(r.content)
if __name__ == '__main__':
    # Script entry point: scrape the page URLs from the default site, then
    # download every file into the current working directory.
    pdf_pages = contraloria_get_url()
    destination = os.getcwd()
    contraloria_download_pdfs(urlist=pdf_pages, targetDir=destination)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment