Skip to content

Instantly share code, notes, and snippets.

@jtemporal
Last active May 24, 2017 02:39
Show Gist options
  • Save jtemporal/749a3cd6461e0f5d004ed23e7c8e8cdc to your computer and use it in GitHub Desktop.
Save jtemporal/749a3cd6461e0f5d004ed23e7c8e8cdc to your computer and use it in GitHub Desktop.
import unicodedata
import pandas as pd
import grequests
from serenata_toolbox.datasets import fetch
# fetch() is expected to place the dataset under data/, which is where it is
# read from on the next step.
fetch('2017-05-22-brazilian-cities.csv', 'data/')
print('reading...')
br_cities = pd.read_csv('data/2017-05-22-brazilian-cities.csv')
print('done!')
print('subset')
# Keep only the first 10 rows. The explicit copy makes the later column
# assignments write to an independent frame rather than to a slice of the
# full one, which is what triggers pandas' SettingWithCopyWarning.
br_cities = br_cities.head(10).copy()
print('lowering')
# Vectorized lower-casing; unlike a per-element lambda it leaves missing
# values untouched instead of raising AttributeError on them.
br_cities['state'] = br_cities['state'].str.lower()
def normalize_string(string):
    """Return *string* lower-cased with diacritics stripped, as plain ASCII.

    The text is lower-cased and decomposed with Unicode NFKD normalization;
    every character that cannot be encoded as ASCII (including the combining
    accents produced by the decomposition) is then dropped.

    Args:
        string: Value to normalize.

    Returns:
        The normalized ``str``, or ``None`` when *string* is not a ``str``.
    """
    if not isinstance(string, str):
        # Non-string input (e.g. a missing value) has no normalized form.
        return None
    nfkd_form = unicodedata.normalize('NFKD', string.lower())
    # Encode with errors='ignore' to discard non-ASCII code points, then
    # decode with the same codec to get a str back.
    return nfkd_form.encode('ASCII', 'ignore').decode('ASCII')
# Upper bound, in seconds, on how long a single portal request may take.
# Without it an unresponsive host can stall the whole batch indefinitely.
REQUEST_TIMEOUT = 10

print('normalizing')
# Pass the function directly; wrapping it in a lambda adds nothing.
br_cities['normalized_name'] = br_cities['name'].apply(normalize_string)
print('replacing spaces')
# NOTE: normalize_string returns None for non-string names, so a row with a
# missing city name raises AttributeError at this step.
br_cities['normalized_name'] = br_cities['normalized_name'] \
    .apply(lambda name: name.replace(' ', ''))
print('normalized')
portal_url = 'https://{}-{}.portaltp.com.br/'
print('creating links')
# Build one URL per row from the (normalized name, state) pair. Zipping the
# two columns avoids a row-wise DataFrame.apply and also works on an empty
# frame.
br_cities['transparency_portal_url'] = [
    portal_url.format(name, state)
    for name, state in zip(br_cities['normalized_name'], br_cities['state'])
]
print('requests')
# Lazily prepare one GET per URL; nothing is sent until grequests.map runs.
rs = (
    grequests.get(url, timeout=REQUEST_TIMEOUT)
    for url in br_cities['transparency_portal_url']
)
def exception_handler(request, exception):
    """Return the placeholder used for a request that raised an exception.

    Passed to ``grequests.map`` as its ``exception_handler`` callback.

    Args:
        request: The request that failed (unused).
        exception: The exception that was raised (unused).

    Returns:
        The integer ``404``, regardless of the arguments.
    """
    return 404
# Send all prepared requests and collect the results; a request that raises
# is delegated to exception_handler.
responses = grequests.map(
    rs,
    exception_handler=exception_handler,
)
print(responses)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment