Created
April 27, 2014 10:12
-
-
Save apoloval/11342157 to your computer and use it in GitHub Desktop.
A Python script that downloads the navigation charts for all Spanish airports from the Aena website. It requires BeautifulSoup4 to be installed on your system.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import errno
import os
import urllib2

from bs4 import BeautifulSoup
# Page of the Aena website from which the crawl starts.
# NOTE(review): a reader comment on this script reports this URL answering
# "HTTP Error 404: Not Found" -- confirm that it is still valid.
BASE_URL = "http://www.aena.es/csee/Satellite/navegacion-aerea/es/Page/1078418725163/?other=1083158950596#ancla3"

# Default number of additional attempts download_file() makes after a failure.
DEFAULT_RETRIES = 3
def leaf_open_list(soup):
    """Return the innermost ``<li class="listOpened">`` found below ``soup``.

    Starting at ``soup``, the first ``li`` element carrying the class
    ``listOpened`` is looked up, then the same lookup is repeated inside
    that element, and so on until no further match exists.

    Args:
        soup: object exposing ``find(name, class_=...)`` (for instance a
            BeautifulSoup tag), or a falsy value.

    Returns:
        The deepest matching element, or a falsy value (``None``) when
        ``soup`` is falsy or contains no matching element at all.
    """
    if not soup:
        return None

    deepest = soup.find("li", class_="listOpened")
    while deepest:
        nested = deepest.find("li", class_="listOpened")
        if not nested:
            # Nothing opened further down: ``deepest`` is the leaf.
            break
        deepest = nested
    return deepest
def make_dir(dir):
    """Create the directory ``dir``, tolerating one that already exists.

    Args:
        dir: path of the directory to create (only the last path
            component is created, as with ``os.mkdir``).

    Raises:
        OSError: if the directory cannot be created for any reason other
            than ``dir`` already existing as a directory.
    """
    try:
        os.mkdir(dir)
    except OSError as err:
        # EEXIST is only harmless when the existing entry is a directory;
        # a regular file of the same name is still an error.
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(dir)
        if not already_a_directory:
            raise
def download_file(url, filename, retries=DEFAULT_RETRIES):
    """Download ``url`` and write the response body to ``filename``.

    When anything goes wrong an error message is printed and the download
    is attempted again, up to ``retries`` additional times. Once no retries
    are left, the exception of the last attempt is re-raised.

    Args:
        url: address of the resource to fetch.
        filename: path of the local file to (over)write, opened in binary
            mode.
        retries: number of additional attempts allowed after a failure.

    Raises:
        Exception: whatever the final failed attempt raised.
    """
    try:
        # Each handle is released by its own context manager, so cleanup
        # only ever touches handles that were actually opened. A failing
        # urlopen() or open() therefore surfaces its own error instead of
        # a NameError raised while closing a variable that was never bound.
        with contextlib.closing(urllib2.urlopen(url)) as web_content:
            with open(filename, "wb") as target_file:
                target_file.write(web_content.read())
    except Exception as e:
        print("Error while downloading %s: %s" % (url, e))
        if retries > 0:
            download_file(url, filename, retries - 1)
        else:
            raise
def download_html(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: address of the web page to download.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        page could not be fetched (an error message is printed in that case).
    """
    try:
        # closing() releases the HTTP response once the body has been read.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            return response.read()
    except urllib2.URLError as e:
        # URLError is the base class of HTTPError, so this covers HTTP error
        # statuses as well as failures to reach the server at all.
        print("Error: cannot download webpage from URL %s: %s" % (url, e))
        return None
def _entry_link_and_name(li):
    """Return ``(absolute_url, safe_name)`` for one ``<li>`` list entry."""
    anchor = li.find("a")
    link = "http://www.aena.es/%s" % anchor.get("href")
    # "/" would be read as a path separator in the local name.
    name = anchor.find("strong").get_text().replace("/", "_")
    return link, name


def download(url, folder):
    """Mirror the chart listing found at ``url`` into ``folder``.

    The page is downloaded and parsed, and the innermost opened list item
    below the element with id ``contentLists`` is located. Every ``<li>``
    of its ``<ul>`` selected with ``class_=""`` is saved as
    ``<folder>/<name>.pdf``; every ``<li>`` with class ``listClosed`` is
    treated as a sub-listing and processed recursively into
    ``<folder>/<name>``.

    Args:
        url: address of the listing page.
        folder: local directory receiving the files; created if missing.

    Returns:
        Always ``None``. The function returns early when the page cannot
        be downloaded or does not contain the expected list structure.
    """
    make_dir(folder)
    body = download_html(url)
    if not body:
        return None

    soup = BeautifulSoup(body)
    opened = leaf_open_list(soup.find(id="contentLists"))
    if not opened:
        # No "contentLists" element or no opened item: nothing to mirror.
        # Without this guard the ``find`` call below fails on ``None``.
        return None
    open_list = opened.find("ul")
    if not open_list:
        return None

    for li in open_list.find_all("li", class_=""):
        file_url, filename = _entry_link_and_name(li)
        pdf_file = "%s/%s.pdf" % (folder, filename)
        print("Downloading %s..." % pdf_file)
        download_file(file_url, pdf_file)

    for li in open_list.find_all("li", class_="listClosed"):
        page_url, name = _entry_link_and_name(li)
        subfolder = "%s/%s" % (folder, name)
        print("Entering subfolder %s" % subfolder)
        download(page_url, subfolder)
# Script entry: mirror the listing that starts at BASE_URL into the local
# "Airports" directory.
download(url=BASE_URL, folder="Airports")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It doesn't work...
root@vps339592:~# python aena.py
Error: cannot download webpage from URL http://www.aena.es/csee/Satellite/navegacion-aerea/es/Page/1078418725163/?other=1083158950596#ancla3: HTTP Error 404: Not Found