Created
June 4, 2016 21:51
-
-
Save 648trindade/111e3ab52a5544de7a04a3cd50667adc to your computer and use it in GitHub Desktop.
Scripts para consultas em lote ao sistema de certificados da Olimpiada Brasileira de Informatica
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# Batch-query script for the certificate system of the Olimpiada Brasileira de Informatica (OBI)
# http://olimpiada.ic.unicamp.br/
# Note: certificate issuance starts in 2005 (OBI itself started in 1999)
# Adapted from the original bash version written by prof. Andrea
# Dependencies: python3-requests
# Author: Rafael
import csv

import requests
# Two-letter state codes queried for every year.
estados = ('AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG', 'MS', 'MT', 'PA',
           'PB', 'PE', 'PI', 'PR', 'RJ', 'RN', 'RO', 'RS', 'RR', 'SE', 'SC', 'SP', 'TO')

# Form fields posted to the query endpoint; 'school_state' is overridden per request.
data = {'partic_type': 'compet',
        'compet_name': '',
        'compet_type': 'choose',
        'school_name': '',
        'school_city': '',
        'school_state': 'choose',
        'order': 'school_state',
        'batch_size': '10000',
        'show': 'Consulta'}

# Per-year base URL of the certificate system; '{}' receives the year.
BASE_URL = 'http://olimpiada.ic.unicamp.br/passadas/OBI{}/certif/'

# Opening part of the certificate anchor tags that are stripped from each row.
LINK_PREFIX = ' <a href="certificado.pdf?id='

# Markup fragments removed verbatim before the rows are split into cells.
ROW_NOISE = ('<tr class="row-light">', '<tr class="row-dark">', ' <td>', '</a>')


def capitalize_name(nome):
    """Return *nome* with every space-separated word capitalized."""
    return " ".join(parte.capitalize() for parte in nome.split(" "))


def parse_rows(html):
    """Extract the result table from a result page as a list of rows.

    Args:
        html: Full text of the page returned by the ``ConsultaCompet`` URL.

    Returns:
        A list of rows, each row being a list of cell strings. The second
        cell of every row (the name column, per the original script) is
        capitalized word by word. An empty list is returned when the page
        has no ``<tr class="row-light">`` row.
    """
    # Narrow the page down to the area of interest (the table body).
    start = html.find('<tr class="row-light">')
    if start < 0:
        return []
    end = html.find('</table>', start)
    html = html[start:end] if end >= 0 else html[start:]

    # Drop tag fragments that carry no data.
    for trecho in ROW_NOISE:
        html = html.replace(trecho, '')

    # Remove the opening tags of the certificate links.
    start = html.find(LINK_PREFIX)
    while start >= 0:
        end = html.find('">', start)
        if end < 0:
            # Unterminated tag: nothing can be removed, so stop instead of
            # searching for the same prefix forever.
            break
        html = html[:start] + html[end + 2:]
        start = html.find(LINK_PREFIX, start)

    # Remove line breaks so that each row/cell is delimited only by tags.
    html = html.replace("\n </td>", "</td>")
    html = html.replace("\n", "")

    # Split into rows and cells; the piece after the last closing tag is discarded.
    rows = []
    for aluno in html.split("</tr>")[:-1]:
        cells = aluno.split("</td>")[:-1]
        if len(cells) > 1:
            # Only rows that actually have a second column get it capitalized.
            cells[1] = capitalize_name(cells[1])
        rows.append(cells)
    return rows


def fetch_results(session, ano, uf):
    """Submit the query for year *ano* and state *uf* and return the result page text.

    The form is POSTed to ``py_consult`` and the result page is then fetched
    from ``ConsultaCompet`` through the same session object (presumably the
    server keeps the submitted query in the session -- confirm).
    """
    base = BASE_URL.format(ano)
    # Copy the form so the module-level template is never mutated.
    session.post(base + 'py_consult', dict(data, school_state=uf))
    return session.get(base + 'ConsultaCompet').text


def write_csv(path, rows):
    """Write *rows* to *path* as UTF-8 CSV, one row per line.

    The csv module quotes fields that contain commas or quotes, so such
    fields no longer shift the columns. Lines end with a bare newline.
    """
    with open(path, "w", encoding="utf-8", newline="") as arquivo:
        csv.writer(arquivo, lineterminator="\n").writerows(rows)


def main():
    """Download every state's competitors for 2005-2015 and write one CSV per year."""
    session = requests.Session()
    for ano in range(2005, 2016):
        matriz = []
        for uf in estados:
            html = fetch_results(session, ano, uf)
            print(ano, uf)
            matriz.extend(parse_rows(html))
        # Write the file for the year.
        write_csv(str(ano) + ".csv", matriz)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Batch-query script for the certificate system of the Olimpiada Brasileira de Informatica (OBI)
# http://olimpiada.ic.unicamp.br/
# Note: certificate issuance starts in 2005 (OBI itself started in 1999)
# Requests captured with Firebug's Net module (network traffic analysis), 'Copy as cURL' option
# Dependencies: curl, iconv
# Author: andrea

# Query the competitors of each year
for ano in $(seq 2005 2015); do
    base_url="http://olimpiada.ic.unicamp.br/passadas/OBI${ano}/certif"

    # Options shared by the POST and the GET below.
    # NOTE(review): the hard-coded _ZopeId cookie is presumably what lets the GET
    # see the query submitted by the POST -- confirm it is still accepted by the server.
    curl_opts=(
        -H 'Host: olimpiada.ic.unicamp.br'
        -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0'
        -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        -H 'Accept-Language: pt-BR,pt;q=0.5'
        --compressed
        -H "Referer: ${base_url}"
        -H 'Cookie: __utma=100712773.1617962711.1439325652.1439325652.1439325652.1; __utmz=100712773.1439325652.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); _ga=GA1.2.1617962711.1439325652; _gat=1; _ZopeId="23404925A7Ik0m5C3Ko"'
        -H 'Connection: keep-alive'
    )

    # Start each year from an empty scratch file: the loop below only appends,
    # so a file left over from an interrupted run would otherwise duplicate rows.
    : > "a_${ano}.csv"

    for uf in AC AL AM AP BA CE DF ES GO MA MG MS MT PA PB PE PI PR RJ RN RO RS RR SE SC SP TO; do
        echo "$ano" "$uf"

        # Submit the query form for this state (the response body goes to stdout, as before).
        curl "${base_url}/py_consult" "${curl_opts[@]}" \
            --data "partic_type=compet&compet_name=&compet_type=choose&school_name=&school_city=&school_state=${uf}&order=school_state&batch_size=30000&show=Consulta"

        # Fetch the result page and turn its table into CSV lines:
        #   grep  - keep only lines containing table/td/tr/th/a tags
        #   sed   - strip leading indentation
        #   tr    - join everything into a single line
        #   sed   - turn every closing </tr> into a newline (one row per line)
        #   sed   - drop the remaining table/tr tags
        #   sed   - drop the cell tag at the start and at the end of each line
        #   sed   - turn each close-cell/open-cell pair into a comma
        #   sed   - drop anchor opening tags, then anchor closing tags
        #   sed   - delete any line that still contains a tag
        #   iconv - convert from ISO-8859-1 to UTF-8
        curl "${base_url}/ConsultaCompet" "${curl_opts[@]}" 2>/dev/null |
            grep -a -e '</\?table\|</\?td\|</\?tr\|</\?th\|</\?a' |
            sed 's/^[\ \t]*//g' |
            tr -d '\n' |
            sed 's/<\/tr[^>]*>/\n/g' |
            sed 's/<\/\?\(table\|tr\)[^>]*>//g' |
            sed 's/^<t[dh][^>]*>\|<\/\?t[dh][^>]*>$//g' |
            sed 's/<\/t[dh][^>]*><t[dh][^>]*>/,/g' |
            sed 's/<a href[^>]*>//g' |
            sed 's/<\/a>//g' |
            sed '/<[^>]*>/d' |
            iconv -f iso-8859-1 -t utf-8 >> "a_${ano}.csv"
    done

    # Remove blank lines, then the repeated table-header lines.
    sed '/^$/d' "a_${ano}.csv" > "b_${ano}.csv"
    sed '/Num. Inscr.,Nome,Modalidade,Escola,Cidade,Estado/d' "b_${ano}.csv" > "${ano}.csv"
    rm -f "a_${ano}.csv" "b_${ano}.csv"

    # Report how many rows were collected for the year.
    wc -l "${ano}.csv"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment