Last active
April 15, 2018 15:37
-
-
Save marcelometal/5c5f77f6076a5fcc06783b1b7c1657d6 to your computer and use it in GitHub Desktop.
ElasticSearch: Candidatos politicos brasileiros
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# | |
# Copyright (c) 2018, Marcelo Jorge Vieira <[email protected]> | |
# | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU Affero General Public License as | |
# published by the Free Software Foundation, either version 3 of the | |
# License, or (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU Affero General Public License for more details. | |
# | |
# You should have received a copy of the GNU Affero General Public License | |
# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
# Column names, in file order, for the election years that ``csv_headers``
# maps to this list.  The position of each name is significant: the names are
# assigned to the columns of a parsed file by index.  Related columns share a
# line purely for readability.
fields2000 = [
    'DATA_GERACAO', 'HORA_GERACAO',
    'ANO_ELEICAO', 'NUM_TURNO', 'DESCRICAO_ELEICAO',
    'SIGLA_UF', 'SIGLA_UE', 'DESCRICAO_UE',
    'CODIGO_CARGO', 'DESCRICAO_CARGO',
    'NOME_CANDIDATO', 'SEQUENCIAL_CANDIDATO', 'NUMERO_CANDIDATO',
    'CPF_CANDIDATO', 'NOME_URNA_CANDIDATO',
    'COD_SITUACAO_CANDIDATURA', 'DES_SITUACAO_CANDIDATURA',
    'NUMERO_PARTIDO', 'SIGLA_PARTIDO', 'NOME_PARTIDO',
    'CODIGO_LEGENDA', 'SIGLA_LEGENDA', 'COMPOSICAO_LEGENDA', 'NOME_LEGENDA',
    'CODIGO_OCUPACAO', 'DESCRICAO_OCUPACAO',
    'DATA_NASCIMENTO', 'NUM_TITULO_ELEITORAL_CANDIDATO', 'IDADE_DATA_ELEICAO',
    'CODIGO_SEXO', 'DESCRICAO_SEXO',
    'COD_GRAU_INSTRUCAO', 'DESCRICAO_GRAU_INSTRUCAO',
    'CODIGO_ESTADO_CIVIL', 'DESCRICAO_ESTADO_CIVIL',
    'CODIGO_NACIONALIDADE', 'DESCRICAO_NACIONALIDADE',
    'SIGLA_UF_NASCIMENTO', 'CODIGO_MUNICIPIO_NASCIMENTO',
    'NOME_MUNICIPIO_NASCIMENTO',
    'DESPESA_MAX_CAMPANHA',
    'COD_SIT_TOT_TURNO', 'DESC_SIT_TOT_TURNO',
]
# Column names, in file order, for the election years that ``csv_headers``
# maps to this list.  The position of each name is significant: the names are
# assigned to the columns of a parsed file by index.  Related columns share a
# line purely for readability.
fields2012 = [
    'DATA_GERACAO', 'HORA_GERACAO',
    'ANO_ELEICAO', 'NUM_TURNO', 'DESCRICAO_ELEICAO',
    'SIGLA_UF', 'SIGLA_UE', 'DESCRICAO_UE',
    'CODIGO_CARGO', 'DESCRICAO_CARGO',
    'NOME_CANDIDATO', 'SEQUENCIAL_CANDIDATO', 'NUMERO_CANDIDATO',
    'CPF_CANDIDATO', 'NOME_URNA_CANDIDATO',
    'COD_SITUACAO_CANDIDATURA', 'DES_SITUACAO_CANDIDATURA',
    'NUMERO_PARTIDO', 'SIGLA_PARTIDO', 'NOME_PARTIDO',
    'CODIGO_LEGENDA', 'SIGLA_LEGENDA', 'COMPOSICAO_LEGENDA', 'NOME_LEGENDA',
    'CODIGO_OCUPACAO', 'DESCRICAO_OCUPACAO',
    'DATA_NASCIMENTO', 'NUM_TITULO_ELEITORAL_CANDIDATO', 'IDADE_DATA_ELEICAO',
    'CODIGO_SEXO', 'DESCRICAO_SEXO',
    'COD_GRAU_INSTRUCAO', 'DESCRICAO_GRAU_INSTRUCAO',
    'CODIGO_ESTADO_CIVIL', 'DESCRICAO_ESTADO_CIVIL',
    'CODIGO_NACIONALIDADE', 'DESCRICAO_NACIONALIDADE',
    'SIGLA_UF_NASCIMENTO', 'CODIGO_MUNICIPIO_NASCIMENTO',
    'NOME_MUNICIPIO_NASCIMENTO',
    'DESPESA_MAX_CAMPANHA',
    'COD_SIT_TOT_TURNO', 'DESC_SIT_TOT_TURNO',
    'NM_EMAIL',
]
# Column names, in file order, for the election years that ``csv_headers``
# maps to this list.  The position of each name is significant: the names are
# assigned to the columns of a parsed file by index.  Related columns share a
# line purely for readability.
fields2014 = [
    'DATA_GERACAO', 'HORA_GERACAO',
    'ANO_ELEICAO', 'NUM_TURNO', 'DESCRICAO_ELEICAO',
    'SIGLA_UF', 'SIGLA_UE', 'DESCRICAO_UE',
    'CODIGO_CARGO', 'DESCRICAO_CARGO',
    'NOME_CANDIDATO', 'SEQUENCIAL_CANDIDATO', 'NUMERO_CANDIDATO',
    'CPF_CANDIDATO', 'NOME_URNA_CANDIDATO',
    'COD_SITUACAO_CANDIDATURA', 'DES_SITUACAO_CANDIDATURA',
    'NUMERO_PARTIDO', 'SIGLA_PARTIDO', 'NOME_PARTIDO',
    'CODIGO_LEGENDA', 'SIGLA_LEGENDA', 'COMPOSICAO_LEGENDA', 'NOME_LEGENDA',
    'CODIGO_OCUPACAO', 'DESCRICAO_OCUPACAO',
    'DATA_NASCIMENTO', 'NUM_TITULO_ELEITORAL_CANDIDATO', 'IDADE_DATA_ELEICAO',
    'CODIGO_SEXO', 'DESCRICAO_SEXO',
    'COD_GRAU_INSTRUCAO', 'DESCRICAO_GRAU_INSTRUCAO',
    'CODIGO_ESTADO_CIVIL', 'DESCRICAO_ESTADO_CIVIL',
    'CODIGO_COR_RACA', 'DESCRICAO_COR_RACA',
    'CODIGO_NACIONALIDADE', 'DESCRICAO_NACIONALIDADE',
    'SIGLA_UF_NASCIMENTO', 'CODIGO_MUNICIPIO_NASCIMENTO',
    'NOME_MUNICIPIO_NASCIMENTO',
    'DESPESA_MAX_CAMPANHA',
    'COD_SIT_TOT_TURNO', 'DESC_SIT_TOT_TURNO',
    'NM_EMAIL',
]
# Election year (as a string) -> column-name list used for that year's files.
# Years with the same layout share one list object rather than a copy.
csv_headers = dict.fromkeys(
    ('2000', '2002', '2004', '2006', '2008', '2010'),
    fields2000,
)
csv_headers['2012'] = fields2012
csv_headers.update(dict.fromkeys(('2014', '2016'), fields2014))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# | |
# Copyright (c) 2018, Marcelo Jorge Vieira <[email protected]> | |
# | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU Affero General Public License as | |
# published by the Free Software Foundation, either version 3 of the | |
# License, or (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU Affero General Public License for more details. | |
# | |
# You should have received a copy of the GNU Affero General Public License | |
# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
import csv | |
import glob | |
from elasticsearch import Elasticsearch | |
from elasticsearch import helpers | |
from pandas import read_csv | |
from pandas.errors import EmptyDataError | |
from fields import csv_headers | |
# Index and document type that every document is written to / searched in.
INDEX_NAME = 'politicos'
DOC_TYPE = 'people'

# Base directory; each election year is looked up in a
# ``consulta_cand_<year>`` sub-directory of it.
FILES_DIR = '/home/metal/Downloads/consulta_cand'

# Number of bulk actions buffered before a bulk request is sent.
OBJECT_LIST_MAXIMUM_COUNTER = 1000

# Module-wide client pointing at an Elasticsearch node on localhost:9200.
es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])
def search(prop, value):
    """Run a ``match`` query on a single field and print the raw response.

    :param prop: name of the document field to match against.
    :param value: text to match in that field.
    """
    query = {'query': {'match': {prop: value}}}
    print(es.search(index=INDEX_NAME, body=query))
def create():
    """Create the index and switch off its write / read-only blocks.

    ``ignore=400`` tells the client not to raise on an HTTP 400 response, so
    calling this when the index already exists is not an error.
    """
    unblock_settings = {
        'index.blocks.write': False,
        'index.blocks.read_only_allow_delete': False,
    }
    es.indices.create(index=INDEX_NAME, ignore=400)
    es.indices.put_settings(index=INDEX_NAME, body=unblock_settings)
def csv2dict(filename):
    """Load a CSV file (header in the first line) as a list of row dicts.

    The file content is interpreted as ISO-8859-1.  Every returned dict maps
    each header name to the decoded text of that cell.  A cell that is absent
    because its row is shorter than the header is reported as ``None``
    (``csv.DictReader``'s fill value) instead of raising.

    :param filename: path of the CSV file to read.
    :returns: list of ``{column_name: text}`` dicts, one per data row; an
        empty list when the file has no data rows.
    """
    encoding = 'ISO-8859-1'

    if str is bytes:
        # Python 2: the csv module operates on byte strings and expects the
        # file in binary mode; cells are decoded one by one below.
        open_kwargs = {'mode': 'rb'}
    else:
        # Python 3: the csv module operates on text; let the file object
        # decode, and disable newline translation as the csv docs require.
        open_kwargs = {'encoding': encoding, 'newline': ''}

    def as_text(value):
        # Byte strings only show up on Python 2; text and None pass through.
        return value.decode(encoding) if isinstance(value, bytes) else value

    with open(filename, **open_kwargs) as csvfile:
        reader = csv.DictReader(csvfile)
        names = reader.fieldnames
        return [
            {name: as_text(row[name]) for name in names}
            for row in reader
        ]
def all_elections():
    """Import the candidate files of every election year in ``csv_headers``."""
    for election_year in csv_headers:
        all_candidates(str(election_year))
def all_candidates(year):
    """Import every ``*.txt`` file found in one election year's directory.

    :param year: election year as a string; selects the
        ``consulta_cand_<year>`` directory under ``FILES_DIR`` and is passed
        on to ``candidates`` for each file.
    """
    pattern = '{0}/consulta_cand_{1}/*.txt'.format(FILES_DIR, year)
    for path in glob.glob(pattern):
        candidates(path, year)
def insert_candidates(es, actions):
    """Bulk-index ``actions``, retrying once after clearing index write blocks.

    The result of ``helpers.bulk`` is printed.  If the first attempt fails,
    the failure is reported, the index's ``blocks.write`` and
    ``blocks.read_only_allow_delete`` settings are set to ``False`` and the
    same actions are sent a second time; an error in that second attempt
    propagates to the caller.

    :param es: Elasticsearch client used for the bulk request and the
        settings update.
    :param actions: bulk actions to send.
    """
    try:
        print(helpers.bulk(es, actions, index=INDEX_NAME, doc_type=DOC_TYPE))
    except Exception as exc:
        # ``Exception`` rather than a bare ``except:`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit stop the import instead of
        # triggering a retry.
        print(
            'bulk insert failed ({0!r}); clearing index write blocks '
            'and retrying'.format(exc)
        )
        es.indices.put_settings(
            index=INDEX_NAME,
            body={
                'index.blocks.write': False,
                'index.blocks.read_only_allow_delete': False,
            },
        )
        print(helpers.bulk(es, actions, index=INDEX_NAME, doc_type=DOC_TYPE))
def candidates(filename, year):
    """Index every row of one ``;``-separated candidate file.

    The file is parsed, its columns are named from ``csv_headers[year]``, the
    result is written next to the input as ``<filename>_header.csv``, and the
    rows of that file are sent to Elasticsearch in batches of
    ``OBJECT_LIST_MAXIMUM_COUNTER`` actions.  An empty input file is reported
    and skipped.

    :param filename: path of the candidate file.
    :param year: election year as a string; must be a key of ``csv_headers``.
    :raises KeyError: if ``year`` has no entry in ``csv_headers``.
    :raises ValueError: (from pandas) if the file's column count differs from
        the header list for ``year``.
    """
    try:
        df = read_csv(
            filename,
            delimiter=';',
            # The column names are supplied below, so the first line is
            # treated as data; without this pandas would consume the first
            # record as a header and that record would never be indexed.
            # NOTE(review): assumes the input files carry no header row of
            # their own -- confirm against a sample file.
            header=None,
            # Keep every cell as the text found in the file: no numeric
            # inference, so identifier-like values keep leading zeros and are
            # never rendered as floats.
            dtype=str,
            # Same encoding that csv2dict() decodes with.
            encoding='ISO-8859-1',
        )
    except EmptyDataError:
        print('{} is empty'.format(filename))
        return
    print(filename)

    # Assigning (rather than passing ``names=``) makes pandas raise when the
    # number of columns does not match the expected layout.
    df.columns = csv_headers[year]

    header_file = '{}_header.csv'.format(filename)
    df.to_csv(header_file, index=False, encoding='ISO-8859-1')

    actions = []
    for row in csv2dict(header_file):
        actions.append({
            '_op_type': 'index',
            '_index': INDEX_NAME,
            '_type': DOC_TYPE,
            '_source': row,
        })
        if len(actions) == OBJECT_LIST_MAXIMUM_COUNTER:
            insert_candidates(es, actions)
            actions = []

    # Flush the final, partially filled batch.
    if actions:
        insert_candidates(es, actions)
def main():
    """Script entry point: run a sample query against the index.

    The commented-out calls below are the one-off setup and import steps;
    uncomment the ones needed before querying.
    """
    # create()
    # all_elections()
    # all_candidates('2016')
    # cands_dir = '{0}/consulta_cand_{1}'.format(FILES_DIR, '2016')
    # filename = '{0}/consulta_cand_{1}_RJ.txt'.format(cands_dir, '2016')
    # candidates(filename, '2016')
    search(prop='DESCRICAO_CARGO', value='Vereador')


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
elasticsearch>=6.0.0,<7.0.0
pandas>=0.22.0,<0.23.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment