Skip to content

Instantly share code, notes, and snippets.

View sergiolucero's full-sized avatar
💭
coding the days away

Sergio Lucero sergiolucero

💭
coding the days away
View GitHub Profile
@sergiolucero
sergiolucero / dask_servel_search.py
Created April 9, 2020 11:57
Searching SERVEL data on S3
import dask.dataframe as dd
# Search the SERVEL CSV files stored on S3 for names containing a token.
token = 'LUCERO'  # substring to look for in the `nombre` column

# Read every region*.csv under the bucket prefix as a single Dask dataframe.
df = dd.read_csv('s3://quantcldata/servel/CLEAN/region*.csv')

# regex=False: match the token literally instead of as a regular expression.
# na=False: rows with a missing name count as "no match" instead of putting
# NaN into the boolean mask, which cannot be used for row selection.
tdf = df[df.nombre.str.contains(token, regex=False, na=False)]

# compute() evaluates the filtered dataframe so the matching rows can be printed.
print(tdf.compute())
@sergiolucero
sergiolucero / lacuerda_new_scraper.py
Created March 27, 2020 21:12
Scraping cancioneros part 2
import requests, pickle, sys
from bs4 import BeautifulSoup
from docx import Document
from operator import methodcaller
def url_bs(url):  # magic!
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    The page is fetched with ``requests.get`` and its text is parsed with
    the ``html5lib`` parser.
    """
    return BeautifulSoup(requests.get(url).text, 'html5lib')
def recopila_acordes(artista = 'Manu Chao'): # "Manu Chao" -> manu_chao
fartist = '_'.join(map(methodcaller("lower"),artista.split()))
url = f'https://acordes.lacuerda.net/{fartist}/'
@sergiolucero
sergiolucero / joy_division.py
Last active March 24, 2020 04:34
Ridgeline plot
import altair as alt
from vega_datasets import data
# URL of the Seattle weather dataset exposed by vega_datasets.
source = data.seattle_weather.url
# Passed as the chart height in the alt.Chart call that follows.
step = 20
# NOTE(review): not used in the visible part of this snippet — presumably
# controls how much neighbouring ridgelines overlap; confirm.
overlap = 1
alt.Chart(source, height=step).transform_timeunit(Month='month(date)'
).transform_joinaggregate(mean_temp='mean(temp_max)', groupby=['Month']
).transform_bin(['bin_max', 'bin_min'], 'temp_max'
@sergiolucero
sergiolucero / covid_folium.py
Created March 11, 2020 21:57
plotting COVID advance
import folium, pandas as pd
from folium.plugins import MarkerCluster
# Load the dataset from the shortened URL.
pdf = pd.read_json('https://tinyurl.com/covid19-github')

# Keep only rows from the latest `data` value that report at least one case.
pdf = pdf[(pdf['data'] == pdf['data'].max()) & (pdf['totale_casi'] > 0)]

# The '50%' row of describe() is the median; the median lat/long is used
# as the map centre.
location = pdf.describe()[['lat', 'long']].loc['50%'].values
fm = folium.Map(location=location, zoom_start=6, tile='stamentoner',
width=800, height=600)
@sergiolucero
sergiolucero / dask_servel.py
Created March 3, 2020 18:18
Procesando el padrón del SERVEL con Dask
from dask.distributed import Client
def pdf2csv(fn):
doc = fitz.open(fn)
fw = open(fn.replace('.pdf','.csv'),'w')
writer = csv.writer(fw)
writer.writerow(['nombre','rut','genero','direccion'])
for ix,page in enumerate(doc):
t = str(page.getText().encode('latin-1'))
@sergiolucero
sergiolucero / serverl_pdf2csv.py
Last active March 20, 2021 23:18
código para pasar de un PDF del SERVEL (Plebiscito 2020) a formato CSV
import csv, glob, fitz
def pdf2csv(fn):
csv_fn = fn.replace('.pdf','.csv')
region = int(fn[1:3]) # A04101.pdf -> 4
with open(csv_fn,'w') as fw:
writer = csv.writer(fw)
writer.writerow(['nombre','rut','genero','direccion',
@sergiolucero
sergiolucero / clustermap.py
Created November 7, 2019 12:07
Folium ClusterMap
import folium
from folium.plugins import MarkerCluster
# Build a folium map whose markers are grouped into clusters.
# NOTE(review): `ndf` is not defined in this snippet — presumably a DataFrame
# with 'lat' and 'lon' columns; confirm against the code that creates it.
mc = MarkerCluster()
fm = folium.Map(location=[40.72, -73.98], zoom_start=5)

# One marker per row. Iterating the two columns directly avoids building a
# Series for every row, which is what iterrows() does.
for lat, lon in zip(ndf['lat'], ndf['lon']):
    mc.add_child(folium.Marker(location=[lat, lon]))

mc.add_to(fm)
fm  # bare expression — presumably so a notebook cell renders the map
@sergiolucero
sergiolucero / asyncio_geocgr.py
Created October 8, 2019 01:05
asyncio scraping geocgr
import pickle, time
import asyncio
import concurrent.futures
import requests
# URL template for the geocgr "newobras" endpoint; %05d takes an integer that
# is zero-padded to five digits.
# NOTE(review): judging by the path the integer is a comuna code — confirm
# against the code that formats this template.
url_base='https://www.contraloria.cl/opencgrapp/geocgr/api/comunas/%05d/newobras'
def get_comunas(region_id):
print(f'START {region_id} [{time.ctime()}]')
region_id = int(region_id)
@sergiolucero
sergiolucero / congreso_scraper.py
Created September 11, 2019 15:15
scraper congreso
import pandas as pd
import xml.etree.ElementTree as ET
import requests, sys
def datos(id):
url = f'https://www.senado.cl/wspublico/tramitacion.php?boletin={id}'
r=requests.get(url)
root = ET.fromstring(r.text)
if len(root):
pdatos = [root[0][0][ix].text for ix in [2,4,7]]
@sergiolucero
sergiolucero / google_places.py
Created August 27, 2019 03:10
google places
import requests, pandas as pd
from creds import GOOGLE_KEY
# Geocoding API URL template; filled with (address, API key), in that order.
GEO_BASE = 'https://maps.googleapis.com/maps/api/geocode/json?address=%s&key=%s'
def georef(address): #1600+Amphitheatre+Parkway,+Mountain+View,+CA
url = GEO_BASE %(address, GOOGLE_KEY)
res = requests.get(url).json().get('results') #returns a list
df = pd.DataFrame(res)