Created
August 27, 2010 17:21
-
-
Save seungjin/553767 to your computer and use it in GitHub Desktop.
Parse a MySQL table exported as XML with Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# journal parser | |
""" | |
<row> | |
<id>2183</id> | |
<date>2010-08-27</date> | |
<time>09:04:12</time> | |
<timezone>MST</timezone> | |
<tag>Scratch</tag> | |
<subject>need to enrich my life culturally, technically, humanly.. Matisse at MoMA: Carving With Color http://nyti.ms/ajIfKK http://seungjin.posterous.com/26733126</subject> | |
<publishing_code>1</publishing_code> | |
<body><null></body> | |
<ref><null></ref> | |
<created_at>2010-08-27 09:05:35</created_at> | |
</row> | |
""" | |
import xml.dom.minidom | |
from xml.dom.minidom import Node | |
# Text the MySQL XML export writes in place of a SQL NULL value.
NULL_MARKER = "<null>"

# Child elements of one <row>, in the order they are printed.
FIELD_NAMES = (
    "id",
    "date",
    "time",
    "timezone",
    "tag",
    "subject",
    "publishing_code",
    "body",
    "ref",
    "created_at",
)


def _field_text(row, name):
    """Return the text of the first <name> element below *row*.

    Returns None when the element is missing or holds the "<null>" marker,
    and "" when the element is present but empty (indexing childNodes[0]
    directly would raise IndexError in that case).
    """
    elements = row.getElementsByTagName(name)
    if not elements:
        return None
    # Join every text/CDATA child instead of trusting childNodes[0] alone.
    text = "".join(
        child.data
        for child in elements[0].childNodes
        if child.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
    )
    if text == NULL_MARKER:
        return None
    return text


def parse_rows(doc):
    """Yield one dict per <row> found inside each <journals> element of *doc*.

    Each dict maps every name in FIELD_NAMES to its text, or to None for a
    missing / "<null>" field.
    """
    for journals in doc.getElementsByTagName("journals"):
        for row in journals.getElementsByTagName("row"):
            yield dict((name, _field_text(row, name)) for name in FIELD_NAMES)


def _printable(value):
    """Return *value* in a form print() can emit on Python 2 and Python 3.

    On Python 2 minidom text is ``unicode``, which is not a ``str``, so it is
    encoded to UTF-8 bytes before printing. On Python 3 the text already is a
    ``str`` and is returned unchanged. None passes through and prints as
    "None".
    """
    if value is None or isinstance(value, str):
        return value
    return value.encode("utf-8")


def main(path="journals.xml"):
    """Parse the XML export at *path* and print each field on its own line.

    To parse an in-memory document instead, build the DOM with
    xml.dom.minidom.parseString(source.encode('utf-8')) and pass it to
    parse_rows().
    """
    doc = xml.dom.minidom.parse(path)
    for record in parse_rows(doc):
        for name in FIELD_NAMES:
            # Single-argument print(...) is valid on both Python 2 and 3.
            print(_printable(record[name]))


if __name__ == "__main__":
    main()
# Google App Engine is so nice with UTF-8: I did not need any of those .encode() calls there. :-)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment