data.gouv.fr :: crawl and download files
#!/usr/bin/env python
# encoding: utf-8
"""
get_tax_data.py
Created by Thomas Cabrol on 2012-02-15.
"""
import os
import urllib
import sys
from datetime import datetime
from BeautifulSoup import BeautifulSoup

# Set some constants...
RAW_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'raw_data')
class Downloader(object):
    ''' Pure hack to crawl data.gouv.fr search results
    and download the files of interest. Please do not try to use it as is
    for anything other than the tax data we are looking for! '''
    def __init__(self):
        '''
        Instantiate the object using the search URL components
        '''
        self.base_url = "http://www.data.gouv.fr/"
        self.search_url = "content/search/(offset)/"
        self.search_string = "?SearchText=&Type=data&Contexte=q%3Dtype%253Adata%26add_hit_meta%3Dhtml_simple_view%2540html_simple_view%26sort_ascending%3D0%26r%3DTop%252Fprimary_producer%252Fministere%2Bdu%2Bbudget%252C%2Bdes%2Bcomptes%2Bpublics%2Bet%2Bde%2Bla%2Breforme%2Bde%2Bl%2527etat%26r%3DTop%252Fkeywords%252Fimpot%2Bsur%2Ble%2Brevenu%26r%3DTop%252Fyear_interval%252F2009&Facet=Top/year_interval/2009"

    def get_files(self, max_offset):
        '''
        Returns a list of all Excel files to download
        '''
        self.files = []
        for offset in xrange(0, max_offset, 10):
            # Fetch raw data from the search URL for this page of results
            search_full_url = self.base_url + self.search_url + str(offset) + self.search_string
            html = urllib.urlopen(search_full_url).read()
            # Soup the content
            soup = BeautifulSoup(html)
            # Extract href links from the content
            for data in soup.findAll('p', {'class': 'download'}):
                link = data('a')[0]['href']
                # Filter only .xls files, skipping duplicates
                if '.xls' in link:
                    if link not in self.files:
                        self.files.append(link)
            print >>sys.stdout, "%i files found...\n" % len(self.files)
        return self.files
    def download(self):
        ''' Actually download the files '''
        # Create the target local directory if it does not exist
        if not os.path.isdir(RAW_DATA_DIR):
            os.makedirs(RAW_DATA_DIR)
        # Loop through the pages of results
        for xl_file in self.get_files(200):
            # Build the URL, name, and local path for each file
            xl_file_url = self.base_url + xl_file
            xl_file_name = xl_file.split('/')[-1]
            xl_file_local = os.path.join(RAW_DATA_DIR, xl_file_name)
            print >>sys.stdout, "Downloading %s..." % xl_file_url
            # Actually retrieve the file
            urllib.urlretrieve(xl_file_url, xl_file_local)
if __name__ == '__main__':
    print "%s : Process starts..." % datetime.now()
    d = Downloader()
    d.download()
    print "%s : Process ends..." % datetime.now()