Created
February 13, 2012 21:23
-
-
Save tcabrol/1820614 to your computer and use it in GitHub Desktop.
Crawling data.gouv.fr for Tax Data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
tax_data.py | |
Created by Thomas Cabrol on 2012-02-12. | |
""" | |
import re | |
import os | |
import urllib | |
from BeautifulSoup import BeautifulSoup | |
import xlrd | |
import simplejson | |
import sys | |
import codecs | |
from datetime import datetime | |
# Set some constants....
# Both data directories live next to this script: raw_data/ holds the
# downloaded xls files, data/ holds the derived tsv output.
_THIS_DIR = os.path.dirname(os.path.realpath(__file__))
RAW_DATA_DIR = os.path.join(_THIS_DIR, 'raw_data')
DATA_DIR = os.path.join(_THIS_DIR, 'data')
class Downloader(object):
    ''' Pure hackery to crawl data.gouv.fr search results
    and download the files of interest. Please do not try to use it as is for
    anything else than the Tax data we are looking for ! '''

    def __init__(self):
        # Site root + paginated search endpoint; the query string is a
        # pre-built filter for the 2009 income-tax datasets published by
        # the French budget ministry.  It must be kept verbatim.
        self.base_url = "http://www.data.gouv.fr/"
        self.search_url = "content/search/(offset)/"
        self.search_string = "?SearchText=&Type=data&Contexte=q%3Dtype%253Adata%26add_hit_meta%3Dhtml_simple_view%2540html_simple_view%26sort_ascending%3D0%26r%3DTop%252Fprimary_producer%252Fministere%2Bdu%2Bbudget%252C%2Bdes%2Bcomptes%2Bpublics%2Bet%2Bde%2Bla%2Breforme%2Bde%2Bl%2527etat%26r%3DTop%252Fkeywords%252Fimpot%2Bsur%2Ble%2Brevenu%26r%3DTop%252Fyear_interval%252F2009&Facet=Top/year_interval/2009"

    def get_files(self, max_offset):
        ''' Return the list of distinct Excel file links found while
        paging through the search results (offsets 0, 10, ... < max_offset,
        i.e. 10 hits per page). '''
        self.files = []
        for offset in xrange(0, max_offset, 10):
            search_full_url = self.base_url + self.search_url + str(offset) + self.search_string
            html = urllib.urlopen(search_full_url).read()
            soup = BeautifulSoup(html)
            # Each search hit exposes its download link in <p class="download">.
            for data in soup.findAll('p', {'class': 'download'}):
                link = data('a')[0]['href']
                # Keep only xls links, de-duplicated across result pages.
                if '.xls' in link and link not in self.files:
                    self.files.append(link)
            print >>sys.stdout, "%i files found...\n" % len(self.files)
        return self.files

    def download(self):
        ''' Download every Excel file found into RAW_DATA_DIR. '''
        if not os.path.isdir(RAW_DATA_DIR):
            # BUG FIX: os.makedir does not exist (AttributeError);
            # os.makedirs is the correct call.
            os.makedirs(RAW_DATA_DIR)
        for xl_file in self.get_files(200):
            xl_file_url = self.base_url + xl_file
            # Local name is the last path component of the remote link.
            xl_file_name = xl_file.split('/')[-1]
            xl_file_local = os.path.join(RAW_DATA_DIR, xl_file_name)
            print >>sys.stdout, "Downloading %s..." % xl_file_url
            urllib.urlretrieve(xl_file_url, xl_file_local)
class ExcelExtractor(object):
    ''' Simple class that takes data from the source xls
    files and appends it into a single tsv file. '''

    def __init__(self):
        ''' Instantiate the object with some values related to
        the (fixed) schema of the xls files: target sheet, row/column
        window, and the column index -> field mapping. '''
        if not os.path.isdir(DATA_DIR):
            # BUG FIX: os.makedir does not exist (AttributeError);
            # os.makedirs is the correct call.
            os.makedirs(DATA_DIR)
        self.outFile = os.path.join(DATA_DIR, 'raw_tax_data.tsv')
        self.sheet_index = 0
        self.min_row = 27       # data rows start after the sheet header block
        self.max_row = 20000    # generous upper bound; shorter sheets stop early
        self.min_col = 1
        self.max_col = 14       # 13 columns, matching columns_map below
        # Column index -> original French header and canonical field name.
        self.columns_map = {
            0 : {"description" : u"DEP", "name" : u"dep"} ,
            1 : {"description" : u"Commune", "name" : u"commune"} ,
            2 : {"description" : u"Libellé de la name", "name" : u"label_commune"} ,
            3 : {"description" : u"Revenu fiscal de référence par tranche (en euros)", "name" : u"segment_fiscal_revenue"} ,
            4 : {"description" : u"Nombre de foyers fiscaux", "name" : u"count_fiscal_hh"} ,
            5 : {"description" : u"Revenu fiscal de référence des foyers fiscaux", "name" : u"ref_fiscal_rev_fiscal_hh"} ,
            6 : {"description" : u"Impôt net (total)", "name" : u"net_tax"} ,
            7 : {"description" : u"Nombre de foyers fiscaux imposables", "name" : u"count_fiscal_hh_taxable"} ,
            8 : {"description" : u"Revenu fiscal de référence des foyers fiscaux imposables", "name" : u"ref_fiscal_rev_fiscal_hh_taxable"} ,
            9 : {"description" : u"Traitements et salaires - Nombre de foyers concernés", "name" : u"salaries_actual_hh_count"} ,
            10: {"description" : u"Traitements et salaires - Montant", "name" : u"salaries_amount"} ,
            11: {"description" : u"Retraites et pensions - Nombre de foyers concernés", "name" : u"retirement_actual_hh_count"} ,
            12: {"description" : u"Retraites et pensions - Montant", "name" : u"retirement_amount"}
        }

    def list_xls(self):
        ''' Return the xls files to be parsed: names starting with a
        3-digit departement code. '''
        # BUG FIX: the original pattern '(.xls)' let '.' match any
        # character; use a raw string and escape the dot.
        return [f for f in os.listdir(RAW_DATA_DIR) if re.match(r'^(\d{3})(\.xls)', f)]

    def printable(self, item):
        ''' Coerce a cell value to a string, leaving strings untouched. '''
        if isinstance(item, basestring):
            return item
        return str(item)

    def xls_to_tsv(self):
        ''' Parse every xls file and append all its records to the
        tsv output file (header line first). '''
        t = codecs.open(self.outFile, 'w', 'utf-8')
        # BUG FIX: read columns_map from self (the original read the
        # global 'e' bound in __main__) and iterate keys in sorted order
        # so the header columns line up with the record columns.
        t.write('\t'.join(self.columns_map[k]['description'] for k in sorted(self.columns_map)) + '\n')
        for infile in self.list_xls():
            try:
                excel = xlrd.open_workbook(os.path.join(RAW_DATA_DIR, infile))
            except Exception:
                print >>sys.stdout, "Failed to open %s..." % infile
                # BUG FIX: the original fell through and reused a stale
                # (or undefined) 'excel' workbook; skip this file instead.
                continue
            sheet = excel.sheet_by_index(self.sheet_index)
            for row in xrange(self.min_row, self.max_row):
                try:
                    record = sheet.row_values(row, start_colx = self.min_col, end_colx = self.max_col)
                except IndexError:
                    # Past the last populated row of this sheet.
                    break
                if not len(record) == len(self.columns_map):
                    print >>sys.stdout, "Line does not match : %s" % record
                t.write('\t'.join(map(self.printable, record)) + '\n')
        t.close()
if __name__ == '__main__':
    # Crawl the portal and download the xls files, then flatten them
    # into one tsv.  NOTE(review): the original ExcelExtractor.xls_to_tsv
    # reads the module-level name 'e' bound here, so this binding must
    # keep that exact name.
    print "%s : Process starts..." % datetime.now()
    d = Downloader()
    d.download()
    e = ExcelExtractor()
    e.xls_to_tsv()
    print "%s : Process ends..." % datetime.now()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment