Created
April 19, 2011 09:10
-
-
Save justinvw/927047 to your computer and use it in GitHub Desktop.
Simple script to convert a CSV file to XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
csv2xml.py | |
Created by Justin van Wees on 2011-04-18. | |
""" | |
import sys | |
import os | |
import string | |
import re | |
import csv | |
import libxml2 | |
VERSION = '0.1 (2011-04-18)' | |
class ConvertToXML(object):
    """Convert a CSV file into an XML document.

    The first CSV row supplies the column headers, which are normalised into
    element names; every following row becomes one record element holding
    one child element per header.  The conversion runs on construction: the
    resulting XML is written to ``dest_file`` when one is given and printed
    to standard output otherwise.
    """

    def __init__(self, options, source_csv, dest_file=None):
        """Parse ``source_csv`` and emit the XML built from it.

        Args:
            options: Object exposing the ``delimiter``, ``quotechar``,
                ``xml_root`` and ``xml_record`` attributes.
            source_csv: Path of the CSV file to read.
            dest_file: Optional path of the XML file to write.  When it is
                omitted (or empty) the XML is printed instead.
        """
        self.csv = self.parse_csv(filename=source_csv,
                                  delimiter=options.delimiter,
                                  quotechar=options.quotechar)
        # An empty input file has no header row; fall back to "no headers"
        # rather than failing with an IndexError on self.csv[0].
        self.headers = self.parse_headers(self.csv[0]) if self.csv else []
        self.xml = self.create_xml(root_element=options.xml_root,
                                   record_element=options.xml_record,
                                   headers=self.headers, csv=self.csv)
        if dest_file:
            self.save(dest_file, self.xml)
        else:
            # Single-argument call form: same output as the Python 2 print
            # statement.
            print(self.xml)

    def parse_csv(self, filename, delimiter, quotechar):
        """Read ``filename`` and return its rows as a list of lists.

        Args:
            filename: Path of the CSV file.
            delimiter: One-character field separator.
            quotechar: One-character quote marker.

        Returns:
            A list with one list of field strings per CSV row.
        """
        csv.register_dialect('custom', delimiter=delimiter,
                             quotechar=quotechar)
        # The registered dialect must be handed to the reader explicitly;
        # otherwise the default 'excel' dialect is used and the requested
        # delimiter and quote character are ignored.
        with open(filename, mode='r') as csv_file:
            return list(csv.reader(csv_file, 'custom'))

    def parse_headers(self, headers):
        """Turn raw header cells into lowercase, underscore-separated names.

        Punctuation is removed, surrounding whitespace is stripped, the text
        is lowercased and remaining spaces become underscores.

        Args:
            headers: Iterable of header strings (the first CSV row).

        Returns:
            A list of normalised header strings, in the same order.
        """
        punct = set(string.punctuation)
        parsed_headers = []
        for head in headers:
            # Strip punctuation
            head = ''.join(ch for ch in head if ch not in punct)
            # Strip whitespace at both ends, lowercase, spaces -> underscores
            parsed_headers.append(head.strip().lower().replace(' ', '_'))
        return parsed_headers

    def create_xml(self, root_element, record_element, headers, csv):
        """Build the XML document with libxml2 and return it serialised.

        Args:
            root_element: Name of the document's root element.
            record_element: Name of the element created for each CSV row.
            headers: Element names for the fields of a row.
            csv: All CSV rows; the first one (the header row) is skipped.

        Returns:
            The formatted document serialised with UTF-8 encoding.
        """
        doc = libxml2.newDoc(version='1.0')
        try:
            root = doc.newChild(None, root_element, None)
            for record in csv[1:]:
                this_record = root.newChild(None, record_element, None)
                for index, header in enumerate(headers):
                    # Rows shorter than the header row get empty elements
                    # for the missing trailing fields.
                    value = record[index] if index < len(record) else ''
                    # newTextChild escapes XML special characters in the
                    # value; newChild would interpret '&' and '<' as markup.
                    this_record.newTextChild(None, header, value or None)
            return doc.serialize(encoding='utf-8', format=1)
        finally:
            # libxml2 documents are not garbage collected by the binding.
            doc.freeDoc()

    @staticmethod
    def _to_text(value):
        """Decode a UTF-8 byte string to text; pass text through unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def create_xml2(self, root_element, record_element, headers, csv):
        """Build the same document with ``xml.dom.minidom`` and print it.

        Alternative to ``create_xml``; it prints the pretty-printed UTF-8
        document and returns nothing.

        Args:
            root_element: Name of the document's root element.
            record_element: Name of the element created for each CSV row.
            headers: Element names for the fields of a row.
            csv: All CSV rows; the first one (the header row) is skipped.
        """
        # Imported here because Document is not among the imports visible at
        # the top of this file.
        from xml.dom.minidom import Document

        doc = Document()
        root = doc.createElement(root_element)
        doc.appendChild(root)
        for record in csv[1:]:
            this_record = doc.createElement(record_element)
            for index, header in enumerate(headers):
                this_item = doc.createElement(self._to_text(header))
                # Missing trailing fields are treated as empty values.
                value = record[index] if index < len(record) else ''
                if value:
                    this_item.appendChild(
                        doc.createTextNode(self._to_text(value)))
                this_record.appendChild(this_item)
            root.appendChild(this_record)
        print(doc.toprettyxml(encoding="UTF-8"))

    def save(self, filename, xml):
        """Write the serialised ``xml`` to ``filename``, replacing its content."""
        # The context manager closes the file even if the write fails.
        with open(filename, 'w') as xml_file:
            xml_file.write(xml)
if __name__ == '__main__':
    # Command-line entry point: csv2xml.py [options] SOURCE_CSV [DEST_XML]
    from optparse import OptionParser

    parser = OptionParser(version="%prog " + VERSION,
                          usage='%prog [options] SOURCE_CSV DEST_XML')
    parser.disable_interspersed_args()
    parser.add_option('-d', '--delimiter', dest='delimiter', type='str', default=',',
                      help="One-char string used to separate fields in the CSV file")
    parser.add_option('-q', '--quote-char', dest='quotechar', type='str',
                      default='"', help="One-char string used to quote fields that contain 'special' chars")
    parser.add_option('-r', '--root-element', dest="xml_root", type='str', default='root',
                      help="Name of the root element")
    parser.add_option('-i', '--record-element', dest="xml_record", type='str',
                      default='record', help="Name of the record elements")
    (options, args) = parser.parse_args()

    # Without a source file there is nothing to convert: show the usage
    # message and exit instead of failing with an IndexError on args[0].
    if not args:
        parser.error('missing SOURCE_CSV argument')

    # The destination is optional; any arguments after it are ignored.
    dest_xml = args[1] if len(args) > 1 else None

    # The constructor performs the conversion and either saves the XML or
    # prints it, so the instance must not be printed here: that would append
    # the object's repr to the XML output.
    ConvertToXML(options, args[0], dest_xml)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment