Skip to content

Instantly share code, notes, and snippets.

@RouxRC
Forked from fmassot/gist:e48269bef73e038228ed
Last active August 29, 2015 14:17
Show Gist options
  • Save RouxRC/fcc95f4d3e8783018664 to your computer and use it in GitHub Desktop.
Save RouxRC/fcc95f4d3e8783018664 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re, xmltodict
def convert_date(s):
    """Convert a ``dd/mm/yyyy`` date string to ``yyyy-mm-dd``.

    Falsy input (``None`` or ``''``) is returned unchanged so that missing
    dates pass straight through.

    Day and month are zero-padded to two digits and surrounding whitespace
    is dropped, so that converted dates can be compared as plain strings
    (lexicographic order then matches chronological order).

    Raises:
        ValueError: if *s* is not made of exactly three ``/``-separated parts.
    """
    if not s:
        return s
    day, month, year = (part.strip() for part in s.split('/'))
    return '-'.join((year, month.zfill(2), day.zfill(2)))
# Replacements applied, in order, to every extracted value so it can be placed
# between double quotes in the JSON object printed by the script. The backslash
# must come first, otherwise the backslashes added by later rules would be
# escaped a second time.
_JSON_ESCAPES = (
    ('\\', '\\\\'),
    ('"', '\\"'),
    ('\n', '\\n'),
    ('\r', '\\r'),
    ('\t', '\\t'),
)


def _as_list(value):
    """Return *value* as a list: ``None`` -> ``[]``, a list as is, else ``[value]``."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _escape(value):
    """Return *value* escaped per ``_JSON_ESCAPES`` and encoded as UTF-8.

    Falsy values (``None``, ``''``) become the empty string.
    """
    if not value:
        value = ""
    for raw, escaped in _JSON_ESCAPES:
        value = value.replace(raw, escaped)
    return value.encode('utf-8')


def parse_question(url, xmlstring):
    """Extract the fields of one question from its XML document.

    Args:
        url: identifier stored (after escaping) under the ``'source'`` key.
        xmlstring: XML document whose root element is ``QUESTION``.

    Returns:
        A dict mapping field names to UTF-8 encoded values in which
        backslashes, double quotes, newlines, carriage returns and tabs are
        backslash-escaped, ready to be wrapped in double quotes. Missing or
        empty values are returned as empty strings.

    Raises:
        KeyError, TypeError, IndexError: when a mandatory element (e.g.
            ``DEPOT``, ``REPONSE``, ``MINA``) is absent or has an unexpected
            shape.
    """
    qe = xmltodict.parse(xmlstring)['QUESTION']

    # xmltodict yields a single mapping for one child element and a list for
    # several, so normalise repeated elements before indexing into them.
    ordres = _as_list(qe['MINA']['ORDRE'])
    indexation = qe['INDEXATION_AN']
    # A single <ANA> child comes back as a plain string; joining it directly
    # would interleave " / " between its characters.
    analyses = [a for a in _as_list((indexation.get('ANALYSE') or {}).get('ANA')) if a]

    # Optional elements may be missing altogether or present but empty (None).
    renouvellement = qe.get('RENOUVELLEMENT') or {}
    signalement = qe.get('SIGNALEMENT') or {}

    depot = qe['DEPOT']
    reponse = qe['REPONSE']

    extracted_data = {
        'source': url,
        'legislature': qe['LEGISLATURE'],
        'type': qe['@TYPE'],
        'numero': depot[0]['@NUMERO'],
        'date_question': convert_date(depot[0]['DATE_JO']),
        'date_reponse': convert_date(reponse['DATE_JO_REPONSE']),
        'date_retrait': "",
        'motif_retrait': "",
        'ministere_attribue': ordres[-1]['DEVELOPPE'],
        'ministere_interroge': qe['MINI']['DEVELOPPE'],
        'tete_analyse': indexation['TETE_ANALYSE'],
        'analyse': " / ".join(analyses),
        'rubrique': indexation['@RUBRIQUE'],
        'question': depot[1]['TEXTE_DEPOT'],
        'reponse': reponse['TEXTE_REPONSE'],
        'auteur': qe['AUTEUR']['PRENOM'] + ' ' + qe['AUTEUR']['NOM'],
        # unused fields
        # Latest of the two dates; `or ''` keeps both operands strings so the
        # comparison never involves None.
        'date_signalement': max(convert_date(renouvellement.get('DATE_JO')) or '',
                                convert_date(signalement.get('DATE_JO')) or ''),
        # Only meaningful when the question was reassigned at least once.
        'date_cht_attr': convert_date(ordres[-1]['DATE_JO']) if len(ordres) > 1 else "",
        'page_question': depot[0]['PAGE_JO'],
        'page_reponse': reponse['PAGE_JO_REPONSE'],
    }

    # Without a response date, the closure element supplies the withdrawal
    # date and, unless it merely records a published response, the reason.
    if not extracted_data['date_reponse']:
        cloture = qe['CLOTURE']
        # NOTE(review): unlike the other dates this one is not passed through
        # convert_date, so it stays in the XML's own format -- confirm whether
        # consumers rely on that before changing it.
        extracted_data['date_retrait'] = cloture['DATE_JO']
        if extracted_data['date_retrait'] and cloture['LIBELLE'] != u"Réponse publiée":
            extracted_data['motif_retrait'] = cloture['LIBELLE'].lower()

    # Escape in place; only values of existing keys are replaced, so iterating
    # over a snapshot of the keys is safe.
    for key in list(extracted_data):
        extracted_data[key] = _escape(extracted_data[key])
    return extracted_data
if __name__ == '__main__':
    # Fail with a usage message instead of an IndexError traceback when the
    # input path is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <question_xml_file>" % sys.argv[0])
    filepath = sys.argv[1]
    # Rebuild the source identifier from the file name: drop the directory
    # part, turn '_' back into '/', and remove the '/vue/xml' segment.
    url = re.sub(r'^.*/([^/]+)$', r'\1', filepath).replace('_', '/').replace('/vue/xml', '')
    with open(filepath, 'r') as f:
        parsed_data = parse_question(url, f.read())
    # Values returned by parse_question are already escaped, so the JSON
    # object is assembled by plain string formatting.
    print("{%s}" % ", ".join('"%s": "%s"' % (k, parsed_data[k]) for k in parsed_data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment