Skip to content

Instantly share code, notes, and snippets.

@seungjin
Created August 27, 2010 17:21
Show Gist options
  • Save seungjin/553767 to your computer and use it in GitHub Desktop.
Save seungjin/553767 to your computer and use it in GitHub Desktop.
Parse a MySQL table exported as XML with Python
# journal parser
"""
<row>
<id>2183</id>
<date>2010-08-27</date>
<time>09:04:12</time>
<timezone>MST</timezone>
<tag>Scratch</tag>
<subject>need to enrich my life culturally, technically, humanly.. Matisse at MoMA: Carving With Color http://nyti.ms/ajIfKK http://seungjin.posterous.com/26733126</subject>
<publishing_code>1</publishing_code>
<body>&lt;null&gt;</body>
<ref>&lt;null&gt;</ref>
<created_at>2010-08-27 09:05:35</created_at>
</row>
"""
import xml.dom.minidom
from xml.dom.minidom import Node
# Column elements expected inside every <row>, in the order they are printed.
FIELD_NAMES = (
    "id",
    "date",
    "time",
    "timezone",
    "tag",
    "subject",
    "publishing_code",
    "body",
    "ref",
    "created_at",
)

# Literal element text that is treated as a missing value.
NULL_MARKER = "<null>"


def element_text(row, tag_name):
    """Return the text of the first ``tag_name`` element under ``row``.

    Returns None when the element is absent or when its text equals
    NULL_MARKER.  An element that is present but empty yields "" instead of
    raising IndexError.
    """
    matches = row.getElementsByTagName(tag_name)
    if not matches:
        return None
    # Join every text/CDATA child so an empty element (no children) or one
    # whose content is split across several nodes is still handled.
    text = "".join(
        child.data
        for child in matches[0].childNodes
        if child.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
    )
    if text == NULL_MARKER:
        return None
    return text


def iter_journal_rows(doc):
    """Yield one dict (field name -> text or None) per <row> in ``doc``.

    Rows are looked up under every <journals> element of the document.
    """
    for journals in doc.getElementsByTagName("journals"):
        for row in journals.getElementsByTagName("row"):
            yield dict((name, element_text(row, name)) for name in FIELD_NAMES)


def to_printable(value):
    """Return ``value`` in a form that can be printed safely.

    None and native ``str`` values are returned unchanged.  Anything else
    (Python 2 ``unicode`` text) is encoded to UTF-8 bytes.
    """
    if value is None or isinstance(value, str):
        return value
    return value.encode("utf-8")


def main(path="journals.xml"):
    """Parse the XML file at ``path`` and print each row, one field per line.

    A missing value is printed as ``None``; rows are separated by an empty
    line.
    """
    doc = xml.dom.minidom.parse(path)
    for record in iter_journal_rows(doc):
        for name in FIELD_NAMES:
            # Single-argument print(...) is valid as both a Python 2 print
            # statement and a Python 3 function call.
            print(to_printable(record[name]))
        # print("") rather than print(): the latter prints "()" on Python 2.
        print("")


if __name__ == "__main__":
    main()
# Google App Engine is so nice with UTF-8: I did not need any of those .encode() calls there. :-)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment