Skip to content

Instantly share code, notes, and snippets.

@dansku
Created August 12, 2012 03:15
Show Gist options
  • Save dansku/3329398 to your computer and use it in GitHub Desktop.
Save dansku/3329398 to your computer and use it in GitHub Desktop.
import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import time
#---[ Functions ]---#
def isTimeFormat(input):
    """Report whether ``input`` parses as a clock time in ``%H:%M`` form.

    The check is delegated to ``time.strptime``: the result is ``True`` when
    the call succeeds and ``False`` when it raises ``ValueError``. Any other
    exception raised by ``strptime`` propagates to the caller.
    """
    try:
        time.strptime(input, '%H:%M')
    except ValueError:
        return False
    else:
        return True
# Regular expression that splits a bus-line entry on the clutter between its
# fields (tabs, line breaks, dashes and pipes).
linha_r = re.compile(r'[\t\n\r\-\|]+')
url = "http://www.pmf.sc.gov.br/servicos/index.php?pagina=onibus"
url_linha = 'http://www.pmf.sc.gov.br/servicos/index.php?pagina=onibuslinha&idLinha='

# This script never creates a database handle itself. Reuse one if it was
# already bound in the module; otherwise saving is skipped instead of dying
# with a NameError on the first company.
db = globals().get('db')


def _le_pagina(pedido):
    """Open ``pedido`` (URL or Request), parse the body and close the response."""
    resposta = urllib2.urlopen(pedido)
    try:
        return BeautifulSoup(resposta.read())
    finally:
        resposta.close()


def _abre_aba(linha_numero, passo):
    """POST ``passoGeral=passo`` to the page of line ``linha_numero``.

    ``passo`` selects the tab: 1 = outbound (ida), 2 = return (volta),
    3 = itinerary. Returns the parsed document.
    """
    corpo = urllib.urlencode({'passoGeral': passo})
    return _le_pagina(urllib2.Request(url_linha + linha_numero, corpo))


def _imprime_horarios(pagina):
    """Print each ``HH:MM`` value that sits right before a ``<br/>``.

    Only the second element carrying ``valign="top"`` is scanned, using its
    serialized markup.
    """
    html = str(pagina.find_all(valign="top")[1])
    pos = html.find('<br/>', 1)
    while pos != -1:
        candidato = html[pos - 5:pos]  # the five characters before the break
        if isTimeFormat(candidato):
            print(candidato)
        pos = html.find('<br/>', pos + 1)


doc = _le_pagina(url)
# print doc

# (name, numeric code) for every <option> of the company selector that has a
# title attribute.
EMPRESAS = []
for e in doc.select('select[name="empresa"] > option'):
    if "title" in e.attrs:
        EMPRESAS.append((e["title"].strip().encode("utf-8"), int(e["value"])))

empresaId = 0
# Counters declared by the script but not used by any statement below.
horarioIdIda = 0
horarioIdVolta = 0
itinaratioId = 0

for empresaId, (empresa, empresa_codigo) in enumerate(EMPRESAS, 1):
    print('EMPRESA :' + empresa)

    # Company record for the database.
    empresadb = {
        'empresaNome': empresa,
        'empresaId': empresaId
    }
    if db is not None:
        db.empresas.save(empresadb)
        print('Empresa salva')

    data = {
        "empresa": empresa_codigo,
        "opcao": 1,
        "passoGeral": 1,
        "passoEmpresa": 1,
    }
    pagina_empresa = _le_pagina(urllib2.Request(url, urllib.urlencode(data)))

    for linha in pagina_empresa.select('ul[class="listagem"] > li'):
        linha_data = linha_r.split(linha.contents[0].strip().encode("utf-8"))
        if len(linha_data) < 2:
            # Entry without a number/name separator: nothing usable to fetch.
            continue
        linha_numero = linha_data[0].strip()
        linha_nome = linha_data[1].strip()
        print('---> ONIBUS:  %s - %s' % (linha_numero, linha_nome.title()))

        # Itinerary tab: print every stop on a single line.
        # NOTE(review): the fixed [16:65] slice presumably cuts the markup
        # around each <li>; confirm against the live page.
        lista = _abre_aba(linha_numero, '3').find_all('ul', 'listagem')[0]
        trechos = [str(k)[16:65].title() for k in lista.find_all('li')]
        if trechos:
            print(' '.join(trechos))

        # Outbound timetable.
        _imprime_horarios(_abre_aba(linha_numero, '1'))

        # Return timetable.
        pagina_volta = _abre_aba(linha_numero, '2')
        print('------> Horario VOLTA:')
        _imprime_horarios(pagina_volta)
import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import time
#---[ Funcoes ]----------------------------------------
def isTimeFormat(input):
    """Return ``True`` if ``time.strptime`` accepts ``input`` as ``%H:%M``.

    A ``ValueError`` from ``strptime`` is turned into ``False``; other
    exceptions are not handled here.
    """
    aceito = True
    try:
        time.strptime(input, '%H:%M')
    except ValueError:
        aceito = False
    return aceito
def busca_linhas(id, LINHAS):
    """Fetch every bus line of one company and record its data in ``LINHAS``.

    The company selection is POSTed to the module-level ``url``. Each
    ``<li>`` directly under ``ul.listagem`` is split with the module-level
    ``linha_r`` into line number and line name; the itinerary and both
    timetables are then downloaded and the record is stored under
    ``LINHAS[line name]``.

    The ``'empresa'`` field of each record is read from the module-level
    name ``empresa``, so the caller must have bound it (the driver loop does).

    Args:
        id: Company code sent as the ``empresa`` form field.
        LINHAS: Dict updated in place, keyed by line name.

    Returns:
        The record stored for the last line processed, or ``None`` when the
        page yields no usable line.
    """
    data = {
        "empresa": id,
        "opcao": 1,
        "passoGeral": 1,
        "passoEmpresa": 1,
    }
    resposta = urllib2.urlopen(urllib2.Request(url, urllib.urlencode(data)))
    try:
        doc = BeautifulSoup(resposta.read())
    finally:
        resposta.close()

    ultima = None
    for linha in doc.select('ul[class="listagem"] > li'):
        linha_data = linha_r.split(linha.contents[0].strip().encode("utf-8"))
        if len(linha_data) < 2:
            # No number/name separator in this entry: skip it rather than
            # abort the whole company with an IndexError.
            continue
        linha_numero = linha_data[0].strip()
        linha_nome = linha_data[1].strip()
        linha_url = "http://www.pmf.sc.gov.br/servicos/index.php?pagina=onibuslinha&idLinha=" + linha_numero + "&menu=2"
        ultima = {
            "nome": linha_nome,
            'empresa': empresa,
            'numero': linha_numero,
            "itinerario": busca_dados_linha_itinerario(linha_url),
            "horarios_ida": busca_dados_linha_horaida(linha_url),
            "horarios_volta": busca_dados_linha_horavolta(linha_url),
        }
        LINHAS[linha_nome] = ultima
    # Returning only after the loop lets every line be collected, and the
    # None default avoids an unbound name when the list is empty.
    return ultima
# Passo Geral (3=Itinerarios) (2=Volta) (1=Ida)
def busca_dados_linha_itinerario(url):
    """Download the itinerary tab of a line page and return its entries.

    POSTs ``passoGeral=3`` to ``url`` and reads every ``<li>`` directly under
    ``ul.listagem``. For each one, the first child is stripped, encoded as
    UTF-8 and its first three characters are dropped (presumably a list
    marker; confirm against the page).

    Returns:
        List of the resulting strings, in page order.
    """
    corpo = urllib.urlencode({"passoGeral": 3})
    pagina = BeautifulSoup(urllib2.urlopen(urllib2.Request(url, corpo)).read())
    return [
        item.contents[0].strip().encode('utf-8')[3:]
        for item in pagina.select('ul[class="listagem"] > li')
    ]
def busca_dados_linha_horaida(url):
    """Download the outbound (ida) timetable of a line page.

    POSTs ``passoGeral=1`` to ``url`` and reads the ``td[valign="top"]``
    cells at positions 1, 2 and 3 as weekday, Saturday and Sunday columns.
    Every text node that is a direct child of a cell is stripped and kept
    when its first four characters pass ``isTimeFormat``.

    Returns:
        Dict with the keys ``"dias_uteis"``, ``"sabado"`` and ``"domingo"``,
        each holding a list of the accepted strings. A column whose cell is
        absent from the page is left empty.
    """
    corpo = urllib.urlencode({"passoGeral": 1})
    resposta = urllib2.urlopen(urllib2.Request(url, corpo))
    try:
        doc = BeautifulSoup(resposta.read())
    finally:
        resposta.close()

    horarios = {"dias_uteis": [], "sabado": [], "domingo": []}
    celulas = doc.select('td[valign="top"]')  # query once, reuse for all columns
    for indice, chave in ((1, "dias_uteis"), (2, "sabado"), (3, "domingo")):
        if indice >= len(celulas):
            continue
        # Only direct text children are visited, so a tag sitting between two
        # times can neither be mistaken for a time nor shift the sequence.
        for texto in celulas[indice].find_all(text=True, recursive=False):
            texto = texto.strip()
            if isTimeFormat(texto[:4]):
                horarios[chave].append(texto)
    return horarios
def busca_dados_linha_horavolta(url):
    """Download the return (volta) timetable of a line page.

    POSTs ``passoGeral=2`` to ``url`` and reads the ``td[valign="top"]``
    cells at positions 1, 2 and 3 as weekday, Saturday and Sunday columns.
    Every text node that is a direct child of a cell is stripped and kept
    when its first four characters pass ``isTimeFormat``.

    Returns:
        Dict with the keys ``"dias_uteis"``, ``"sabado"`` and ``"domingo"``,
        each holding a list of the accepted strings. A column whose cell is
        absent from the page is left empty.
    """
    corpo = urllib.urlencode({"passoGeral": 2})
    resposta = urllib2.urlopen(urllib2.Request(url, corpo))
    try:
        doc = BeautifulSoup(resposta.read())
    finally:
        resposta.close()

    horarios = {"dias_uteis": [], "sabado": [], "domingo": []}
    celulas = doc.select('td[valign="top"]')  # query once, reuse for all columns
    for indice, chave in ((1, "dias_uteis"), (2, "sabado"), (3, "domingo")):
        if indice >= len(celulas):
            continue
        # Only direct text children are visited, so a tag sitting between two
        # times can neither be mistaken for a time nor shift the sequence.
        for texto in celulas[indice].find_all(text=True, recursive=False):
            texto = texto.strip()
            if isTimeFormat(texto[:4]):
                horarios[chave].append(texto)
    return horarios
#------------[ Program ]------------
# Regular expression that splits a bus-line entry on the clutter between its
# fields (tabs, line breaks, dashes and pipes).
linha_r = re.compile(r'[\t\n\r\-\|]+')
# URL of the city hall site.
url = "http://www.pmf.sc.gov.br/servicos/index.php?pagina=onibus"
# Load the page and parse it, closing the response once it has been read.
f = urllib2.urlopen(url)
try:
    doc = BeautifulSoup(f.read())
finally:
    f.close()
# Read the companies: each key is a (name, numeric code) tuple taken from an
# <option> of the company selector that carries a title attribute.
EMPRESAS = {}
for e in doc.select('select[name="empresa"] > option'):
    if "title" in e.attrs:
        nome = (e["title"].strip().encode("utf-8"), int(e["value"]))
        EMPRESAS[nome] = {}
# Load every line and its data. The loop variable must stay named `empresa`
# because busca_linhas reads that module-level name.
LINHAS = {}
for empresa, empresa_id in EMPRESAS:
    print(busca_linhas(empresa_id, LINHAS))
import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import time
# Regular expression that splits a bus-line entry on the clutter between its
# fields (tabs, line breaks, dashes and pipes).
linha_r = re.compile(r'[\t\n\r\-\|]+')
url = "http://www.pmf.sc.gov.br/servicos/index.php?pagina=onibus"
# Fetch and parse the landing page, closing the response once it is read.
f = urllib2.urlopen(url)
try:
    doc = BeautifulSoup(f.read())
finally:
    f.close()
# print doc
# Each key is a (name, numeric code) tuple taken from an <option> of the
# company selector that carries a title attribute; the values start empty.
EMPRESAS = {}
for e in doc.select('select[name="empresa"] > option'):
    if "title" in e.attrs:
        nome = (e["title"].strip().encode("utf-8"), int(e["value"]))
        EMPRESAS[nome] = {}
def isTimeFormat(input):
    """Check ``input`` against the ``%H:%M`` time format.

    Returns ``True`` when ``time.strptime`` parses the value and ``False``
    when it raises ``ValueError``; other exceptions are left to propagate.
    """
    try:
        time.strptime(input, '%H:%M')
    except ValueError:
        return False
    else:
        return True
def busca_linhas(id, LINHAS):
    """Fetch every bus line of one company, store it in ``LINHAS`` and print it.

    The company selection is POSTed to the module-level ``url``. Each
    ``<li>`` directly under ``ul.listagem`` is split with the module-level
    ``linha_r`` into line number and line name; the itinerary and both
    timetables are then downloaded, the record is stored under
    ``LINHAS[line name]`` and printed after a separator line.

    The ``'empresa'`` field of each record is read from the module-level
    name ``empresa``, so the caller must have bound it (the driver loop does).

    Args:
        id: Company code sent as the ``empresa`` form field.
        LINHAS: Dict updated in place, keyed by line name.
    """
    data = {
        "empresa": id,
        "opcao": 1,
        "passoGeral": 1,
        "passoEmpresa": 1,
    }
    resposta = urllib2.urlopen(urllib2.Request(url, urllib.urlencode(data)))
    try:
        doc = BeautifulSoup(resposta.read())
    finally:
        resposta.close()

    for linha in doc.select('ul[class="listagem"] > li'):
        linha_data = linha_r.split(linha.contents[0].strip().encode("utf-8"))
        if len(linha_data) < 2:
            # No number/name separator in this entry: skip it rather than
            # abort the whole company with an IndexError.
            continue
        linha_numero = linha_data[0].strip()
        linha_nome = linha_data[1].strip()
        linha_url = "http://www.pmf.sc.gov.br/servicos/index.php?pagina=onibuslinha&idLinha=" + linha_numero + "&menu=2"
        LINHAS[linha_nome] = {
            "nome": linha_nome,
            'empresa': empresa,
            'numero': linha_numero,
            "itinerario": busca_dados_linha_itinerario(linha_url),
            "horarios_ida": busca_dados_linha_horaida(linha_url),
            "horarios_volta": busca_dados_linha_horavolta(linha_url),
        }
        print('---------------------------------//-------//---->')
        print(LINHAS[linha_nome])
# Passo Geral (3=Itinerarios) (2=Volta) (1=Ida)
def busca_dados_linha_itinerario(url):
    """Return the itinerary entries listed on a line page.

    ``passoGeral=3`` is POSTed to ``url``; every ``<li>`` directly under
    ``ul.listagem`` contributes its first child, stripped, UTF-8 encoded and
    with the first three characters removed (presumably a list marker;
    confirm against the page).

    Returns:
        List of the resulting strings, in page order.
    """
    pedido = urllib2.Request(url, urllib.urlencode({"passoGeral": 3}))
    pagina = BeautifulSoup(urllib2.urlopen(pedido).read())
    trechos = pagina.select('ul[class="listagem"] > li')
    return [trecho.contents[0].strip().encode('utf-8')[3:] for trecho in trechos]
def busca_dados_linha_horaida(url):
    """Download the outbound (ida) timetable of a line page.

    POSTs ``passoGeral=1`` to ``url`` and reads the ``td[valign="top"]``
    cells at positions 1, 2 and 3 as weekday, Saturday and Sunday columns.
    Every text node that is a direct child of a cell is stripped and kept
    when its first four characters pass ``isTimeFormat``.

    Returns:
        Dict with the keys ``"dias_uteis"``, ``"sabado"`` and ``"domingo"``,
        each holding a list of the accepted strings. A column whose cell is
        absent from the page is left empty.
    """
    corpo = urllib.urlencode({"passoGeral": 1})
    resposta = urllib2.urlopen(urllib2.Request(url, corpo))
    try:
        doc = BeautifulSoup(resposta.read())
    finally:
        resposta.close()

    horarios = {"dias_uteis": [], "sabado": [], "domingo": []}
    celulas = doc.select('td[valign="top"]')  # query once, reuse for all columns
    for indice, chave in ((1, "dias_uteis"), (2, "sabado"), (3, "domingo")):
        if indice >= len(celulas):
            continue
        # Only direct text children are visited, so a tag sitting between two
        # times can neither be mistaken for a time nor shift the sequence.
        for texto in celulas[indice].find_all(text=True, recursive=False):
            texto = texto.strip()
            if isTimeFormat(texto[:4]):
                horarios[chave].append(texto)
    return horarios
def busca_dados_linha_horavolta(url):
    """Download the return (volta) timetable of a line page.

    POSTs ``passoGeral=2`` to ``url`` and reads the ``td[valign="top"]``
    cells at positions 1, 2 and 3 as weekday, Saturday and Sunday columns.
    Every text node that is a direct child of a cell is stripped and kept
    when its first four characters pass ``isTimeFormat``.

    Returns:
        Dict with the keys ``"dias_uteis"``, ``"sabado"`` and ``"domingo"``,
        each holding a list of the accepted strings. A column whose cell is
        absent from the page is left empty.
    """
    corpo = urllib.urlencode({"passoGeral": 2})
    resposta = urllib2.urlopen(urllib2.Request(url, corpo))
    try:
        doc = BeautifulSoup(resposta.read())
    finally:
        resposta.close()

    horarios = {"dias_uteis": [], "sabado": [], "domingo": []}
    celulas = doc.select('td[valign="top"]')  # query once, reuse for all columns
    for indice, chave in ((1, "dias_uteis"), (2, "sabado"), (3, "domingo")):
        if indice >= len(celulas):
            continue
        # Only direct text children are visited, so a tag sitting between two
        # times can neither be mistaken for a time nor shift the sequence.
        for texto in celulas[indice].find_all(text=True, recursive=False):
            texto = texto.strip()
            if isTimeFormat(texto[:4]):
                horarios[chave].append(texto)
    return horarios
# Collect every line of every company, then dump both tables.
LINHAS = {}
# The keys of EMPRESAS are (name, code) tuples. The first loop variable must
# stay named `empresa` because busca_linhas reads that module-level name; the
# second one no longer rebinds the builtin `id`.
for empresa, empresa_id in EMPRESAS:
    busca_linhas(empresa_id, LINHAS)
print(EMPRESAS)
print(LINHAS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment