Skip to content

Instantly share code, notes, and snippets.

@vaclavcadek
Created August 5, 2014 13:17
Show Gist options
  • Save vaclavcadek/084d9f63df5d48f24345 to your computer and use it in GitHub Desktop.
Save vaclavcadek/084d9f63df5d48f24345 to your computer and use it in GitHub Desktop.
Simple web scraper for collecting all ARST schedules (for offline use :))
import urllib2
from bs4 import BeautifulSoup
import os
# Sample of the HTML this script expects to parse: each city is a
# <div class="mainExpand"> holding the city name in an <h2>, followed by a
# <div class="texter"> holding one <a href="NNN.pdf"> per timetable.
# NOTE(review): appears to be kept for reference only -- nothing in the
# visible script reads html_doc_example.
html_doc_example = """
<html><head><title>ARST - Orari</title></head>
<div onclick="openClose('a1')" class="mainExpand"><h2><a style="cursor:pointer; src="pdfmini.jpg"/>ABBASANTA</h2></div>
<div id="a1" class="texter">
<a href="402.pdf" target="_blank"><img src="pdfmini.jpg" />402 - ABBASANTA-GHILARZA-AIDOMAGGIORE-NORBELLO-ABBASANTA</a></br>
<a href="413.pdf" target="_blank"><img src="pdfmini.jpg" />413 - ABBASANTA-ARDAULI-NEONELI-ORISTANO</a></br>
<a href="414.pdf" target="_blank"><img src="pdfmini.jpg" />414 - BUSACHI-ARDAULI-SORRADILE-ABBASANTA-ORISTANO</a></br>
<a href="415.pdf" target="_blank"><img src="pdfmini.jpg" />415 - SAMUGHEO-FORDONGIANUS-ULA TIRSO-BUSACHI-ABBASANTA</a></br>
<a href="423.pdf" target="_blank"><img src="pdfmini.jpg" />423 - ABBASANTA-S.LUSSURGIU-MILIS-ORISTANO</a></br>
<a href="424.pdf" target="_blank"><img src="pdfmini.jpg" />424 - ORISTANO-S.VERO MILIS-MILIS-S.LUSSURGIU-ABBASANTA</a></br>
<a href="435.pdf" target="_blank"><img src="pdfmini.jpg" />435 - ABBASANTA-GHILARZA-AIDOMAGGIORE-BORORE-DUALCHI-SEDILO-OTTANA-NUORO</a></br>
<a href="497.pdf" target="_blank"><img src="pdfmini.jpg" />497 - AUTOLINEA OCCASIONALE: ABBASANTA-PAULILATINO-BONARCADO (SANTUARIO)</a></br>
<a href="502.pdf" target="_blank"><img src="pdfmini.jpg" />502 - FONNI-DESULO-SORGONO-ABBASANTA</a></br>
<a href="530.pdf" target="_blank"><img src="pdfmini.jpg" />530 - SORGONO-ORTUERI-ABBASANTA-ORISTANO</a></br>
<a href="713.pdf" target="_blank"><img src="pdfmini.jpg" />713 - ALGHERO-URI-ITTIRI-THIESI-MACOMER</a></br>
</div>
<div onclick="openClose('a2')" class="mainExpand"><h2><a style="cursor:pointer;">AGGIUS</a></h2></div>
<div id="a2" class="texter">
<a href="9316.pdf" target="_blank"><img src="pdfmini.jpg" />9316 - TEMPIO-AGLIENTU-VIGNOLA-S.TERESA DI GALLURA</a></br>
<a href="9319.pdf" target="_blank"><img src="pdfmini.jpg" />9319 - CALANGIANUS-TEMPIO P.-TRINITA' D'AGULTU-ISOLA ROSSA</a></br>
<a href="9328.pdf" target="_blank"><img src="pdfmini.jpg" />9328 - TEMPIO-LUOGOSANTO-PALAU (VIA MONTAGNA)</a></br>
</div>
"""
# Root of the site; index pages live at BASE_URL + '<letter>.html' and the
# PDF hrefs found on those pages are resolved against it as well.
BASE_URL = 'http://www.arstspa.info/'
# Letters for which an index page is fetched (h, k, w, x and y are omitted).
start_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'z']
# Everything is stored below this directory, relative to the working dir.
OUTPUT_DIR = 'collected'


def _ensure_dir(path):
    """Create directory *path* (including parents) unless it already exists."""
    if not os.path.exists(path):
        os.makedirs(path)


def _safe_name(name):
    """Return *name* with '/' replaced by '-' so it stays one path component."""
    return name.replace('/', '-')


def _parse_index(html_doc):
    """Map each city on an index page to its list of (title, url) tuples.

    City names come from the <h2> of every <div class="mainExpand">; link
    lists come from the <a href=...> tags of every <div class="texter">.
    The two sequences are paired purely by position, so the result is only
    meaningful while the page keeps one "texter" div per "mainExpand" div.
    """
    # The pages use the invalid closing tag </br>; drop it before parsing.
    source = BeautifulSoup(html_doc.replace('</br>', ''))
    cities = []
    links = []
    for tag in source.find_all('div'):
        clazz = tag.get('class')
        if not clazz:
            continue
        if clazz[0] == 'mainExpand':
            cities.append(tag.h2.text)
        elif clazz[0] == 'texter':
            links.append([(anchor.text, BASE_URL + anchor['href'])
                          for anchor in tag.find_all('a', href=True)])
    return dict(zip(cities, links))


def _download(link, file_path):
    """Fetch *link* and store the response body at *file_path*.

    An HTTP error response is reported on stdout and the file is skipped;
    any other failure propagates to the caller.
    """
    try:
        pdf_raw = urllib2.urlopen(link).read()
    except urllib2.HTTPError:
        print('PDF at %s unreachable (broken-link)!' % link)
        return
    # Binary mode: PDFs must be written byte-for-byte (text mode would
    # translate newlines on some platforms); 'with' closes the handle even
    # if the write fails.
    with open(file_path, 'wb') as local_file:
        local_file.write(pdf_raw)


# NOTE: this is a Python 2 script (urllib2); the crawl runs at module level.
_ensure_dir(OUTPUT_DIR)
for letter in start_letters:
    html_doc = urllib2.urlopen(BASE_URL + letter + '.html').read()
    cities_with_links = _parse_index(html_doc)
    print("Processing cities starting with '%s'" % letter)
    letter_dir = os.path.join(OUTPUT_DIR, letter)
    _ensure_dir(letter_dir)
    for city, documents in cities_with_links.items():
        # Sanitise the city name the same way as the file names so a '/'
        # in it cannot silently create nested directories.
        city_dir = os.path.join(letter_dir, _safe_name(city))
        _ensure_dir(city_dir)
        for title, link in documents:
            _download(link, os.path.join(city_dir, _safe_name(title + '.pdf')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment