Created
June 30, 2012 02:00
-
-
Save 7shi/3021786 to your computer and use it in GitHub Desktop.
Pythonの簡易XMLパーサとC++高速版
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# public domain | |
from StringIO import * | |
def replaces(s, args):
    """Return *s* after applying every (old, new) replacement in *args*.

    *args* may be a mapping ``{old: new}`` or an iterable of ``(old, new)``
    pairs. Replacements are applied one after another, so a later
    replacement sees the output of the earlier ones; pass an ordered
    sequence of pairs when that order matters.
    """
    # ``items()`` exists on both Python 2 and Python 3 dicts, unlike
    # ``iteritems()``.
    pairs = args.items() if hasattr(args, "items") else args
    for key, value in pairs:
        s = s.replace(key, value)
    return s
def from_entity(s):
    """Decode the five supported character entities in *s*.

    Handles ``&lt;``, ``&gt;``, ``&quot;``, ``&nbsp;`` (decoded to a plain
    space) and ``&amp;``. Any other entity is left untouched.
    """
    # The order is fixed and "&amp;" comes last on purpose: decoding it
    # earlier would turn "&amp;lt;" into "&lt;" and then into "<".
    for entity, char in (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", '"'),
        ("&nbsp;", " "),
        ("&amp;", "&"),
    ):
        s = s.replace(entity, char)
    return s
class reader:
    """Forward-only pull parser for XML/HTML-like markup.

    Every successful ``read()`` advances to the next tag and exposes:

    * ``text``   -- the character data in front of the tag, entity-decoded
      through ``from_entity``;
    * ``tag``    -- the tag name in lower case; end tags keep their leading
      ``/`` and comments are reported as ``!--``;
    * ``values`` -- attributes keyed by lower-cased name (attribute values
      are stored as written, without entity decoding); for a comment the
      body is stored under the key ``"comment"``.

    A self-closing tag ``<x/>`` is followed by a synthetic ``/x`` tag on the
    next ``read()``.
    """

    pos = 0          # index in src of the next character to consume
    reserved = ""    # pending synthetic end tag ("/name"), or "" if none
    text = ""
    tag = ""

    def __init__(self, src):
        """Create a parser over the string *src*."""
        self.src = src
        # Per-instance dict: a class-level dict would be shared by every
        # parser that has not called read() yet.
        self.values = {}
        self.cur = ""

    def __getitem__(self, key):
        """Return the value of attribute *key* of the current tag."""
        return self.values[key]

    def has_key(self, key):
        """Return True if the current tag has an attribute named *key*."""
        return key in self.values

    def check(self, tag, values):
        """Return True if the current tag is *tag* and carries *values*.

        *values* maps attribute names to the exact values required.
        """
        if tag != self.tag:
            return False
        # ``type(u"")`` is ``unicode`` on Python 2 and ``str`` on Python 3.
        text_type = type(u"")
        for key, value in values.items():
            k = text_type(key)
            if not self.has_key(k) or self[k] != value:
                return False
        return True

    def find(self, tag, values=None):
        """Advance until a tag matching ``check(tag, values)`` is current.

        Returns True when such a tag was found, False at end of input.
        """
        wanted = {} if values is None else values
        while self.read():
            if self.check(tag, wanted):
                return True
        return False

    def each(self, tag="", values=None):
        """Iterate over the tags up to the end tag of the current tag.

        Yields a running index for every tag read (when *tag* is ``""``)
        or for every tag matching ``check(tag, values)``. Stops once the
        current tag equals ``"/" + <tag at call time>`` or input runs out.
        """
        wanted = {} if values is None else values
        end = "/" + self.tag
        i = 0
        while self.tag != end and self.read():
            if tag == "" or self.check(tag, wanted):
                yield i
                # NOTE(review): the source had lost its indentation; the
                # counter is assumed to advance only for yielded tags.
                i += 1

    def read(self):
        """Advance to the next tag; return False at end of input."""
        self.text = ""
        self.tag = ""
        self.values = {}
        # NOTE(review): the end-of-input test comes first, so a synthetic
        # end tag still pending in ``reserved`` is not delivered when a
        # self-closing tag is the very last thing in the input.
        if self.pos >= len(self.src):
            return False
        if self.reserved != "":
            self.tag, self.reserved = self.reserved, ""
        else:
            self.read_text()
        return True

    def read_text(self):
        """Read character data up to the next ``<``, then parse that tag."""
        p = self.src.find("<", self.pos)
        if p < 0:
            # No more tags: the rest of the input is text.
            self.text = from_entity(self.src[self.pos:])
            self.pos = len(self.src)
        else:
            self.text = from_entity(self.src[self.pos:p])
            self.pos = p + 1
            self.read_tag()

    def read_char(self):
        """Consume one character into ``self.cur``; ``""`` at end of input."""
        if self.pos >= len(self.src):
            self.cur = ""
        else:
            self.cur = self.src[self.pos]
            self.pos += 1
        return self.cur

    def read_tag(self):
        """Parse a tag name (``pos`` is just past ``<``) and its attributes."""
        name = []
        # Pre-set so the checks below are safe when the input ends right
        # after "<" and the loop body never runs.
        ch = ""
        while self.read_char() != "":
            ch = self.cur
            if ch == ">" or (ch == "/" and name):
                break
            elif ch > " ":
                name.append(ch)
                if len(name) == 3 and "".join(name) == "!--":
                    break
            elif name:
                # Whitespace after the name ends it; leading whitespace
                # is skipped.
                break
        self.tag = "".join(name).lower()
        if ch == "/":
            # "<name/": queue the synthetic end tag for the next read().
            self.reserved = "/" + self.tag
            ch = self.read_char()
        if ch != ">":
            if self.tag == "!--":
                self.read_comment()
            else:
                while self.read_values():
                    pass

    def read_comment(self):
        """Store everything up to ``-->`` in ``values["comment"]``."""
        p = self.src.find("-->", self.pos)
        if p < 0:
            # Unterminated comment: take the rest of the input.
            self.values["comment"] = self.src[self.pos:]
            self.pos = len(self.src)
        else:
            self.values["comment"] = self.src[self.pos:p]
            self.pos = p + 3

    def read_values(self):
        """Parse one attribute; return False once the tag is finished."""
        nm = self.read_value(True).lower()
        if nm == "":
            return False
        if self.cur == "/":
            self.reserved = "/" + self.tag
        if self.cur == "=":
            self.values[nm] = self.read_value(False)
        else:
            # Attribute without a value.
            self.values[nm] = ""
        return self.cur != ">"

    def read_value(self, isleft):
        """Read one token: an attribute name (*isleft*) or its value.

        Leading whitespace is skipped. The token ends at ``>``, at
        whitespace following it, at the closing quote of a
        double-quoted string, or (names only) at ``=`` or ``/``.
        """
        buf = []
        while self.read_char() != "":
            ch = self.cur
            if ch == ">" or (isleft and (ch == "=" or ch == "/")):
                break
            elif ch == '"':
                # Quoted string: copy verbatim up to the closing quote.
                while self.read_char() != "":
                    if self.cur == '"':
                        break
                    buf.append(self.cur)
                break
            elif ch > " ":
                buf.append(ch)
            elif buf:
                break
        return "".join(buf)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// public domain | |
#include <cctype>
#include <cstdio>
#include <cwctype>
#include <map>
#include <string>
using namespace std; | |
// Parser state; handed to callers as an opaque pointer by alloc().
struct reader {
    int pos;                       // index in src of the next character to consume
    int cur;                       // last value produced by read_char(); -1 at end of input
    wstring src;                   // the complete input text
    wstring reserved;              // pending synthetic end tag ("/name"), empty if none
    wstring text;                  // character data preceding the current tag
    wstring tag;                   // name of the current tag
    map<wstring, wstring> values;  // attributes of the current tag
};
extern "C" { | |
reader *alloc(const wchar_t *); | |
void release(reader *); | |
const wchar_t *gettext(reader *); | |
const wchar_t *gettag(reader *); | |
const wchar_t *getitem(reader *, const wchar_t *); | |
bool has_key(reader *, const wchar_t *); | |
bool check(reader *, const wchar_t *, const wchar_t **); | |
bool find(reader *, const wchar_t *, const wchar_t **); | |
bool read(reader *); | |
} | |
static void read_text(reader *); | |
static int read_char(reader *); | |
static void read_tag(reader *); | |
static void read_comment(reader *); | |
static bool read_values(reader *); | |
static wstring read_value(reader *, bool); | |
// Return a copy of `src` with every occurrence of `s1` replaced by `s2`.
// Matches are found left to right and do not overlap. An empty `s1`
// matches nothing, so `src` is returned unchanged.
static wstring replace(const wstring &src, const wstring &s1, const wstring &s2) {
    // An empty pattern would match at every position without advancing.
    if (s1.empty()) return src;
    wstring ret;
    wstring::size_type p = 0;
    for (;;) {
        // Compare against npos directly instead of narrowing to int.
        const wstring::size_type hit = src.find(s1, p);
        if (hit == wstring::npos) {
            ret.append(src, p, wstring::npos);
            break;
        }
        ret.append(src, p, hit - p);
        ret += s2;
        p = hit + s1.size();
    }
    return ret;
}
// Return a lower-cased copy of `src`.
// Uses the wide-character towlower(); the narrow <cctype> functions are
// undefined for wchar_t values outside the unsigned char range. Which
// characters beyond ASCII are converted depends on the current C locale.
static wstring lower(const wstring &src) {
    wstring ret;
    ret.reserve(src.size());
    for (auto it = src.begin(); it != src.end(); ++it)
        ret += static_cast<wchar_t>(towlower(*it));
    return ret;
}
// Decode the five supported character entities in `s`:
// &lt; &gt; &quot; &nbsp; (to a plain space) and &amp;.
// "&amp;" is decoded last so that "&amp;lt;" yields "&lt;", not "<".
static wstring from_entity(wstring s) {
    s = replace(s, L"&lt;", L"<");
    s = replace(s, L"&gt;", L">");
    s = replace(s, L"&quot;", L"\"");
    s = replace(s, L"&nbsp;", L" ");
    s = replace(s, L"&amp;", L"&");
    return s;
}
// Create a parser over a copy of the NUL-terminated wide string `src`.
// A NULL `src` is treated as empty input. The returned object must be
// freed with release().
reader *alloc(const wchar_t *src) {
    auto ret = new reader;
    ret->pos = 0;
    ret->cur = -1;
    // Assigning a null pointer to a wstring is undefined behaviour.
    if (src)
        ret->src = src;
    return ret;
}
// Destroy a parser created by alloc(). Passing NULL does nothing.
void release(reader *self) {
    if (self != NULL)
        delete self;
}
// Return the character data preceding the current tag. The pointer refers
// to storage owned by `self`.
const wchar_t *gettext(reader *self) {
    const wstring &current = self->text;
    return current.c_str();
}
// Return the name of the current tag. The pointer refers to storage owned
// by `self`.
const wchar_t *gettag(reader *self) {
    const wstring &current = self->tag;
    return current.c_str();
}
// Return the value of attribute `key` of the current tag, or NULL when the
// tag has no such attribute.
const wchar_t *getitem(reader *self, const wchar_t *key) {
    auto found = self->values.find(key);
    if (found == self->values.end())
        return NULL;
    return found->second.c_str();
}
// Return true when the current tag has an attribute named `key`.
bool has_key(reader *self, const wchar_t *key) {
    return self->values.count(key) != 0;
}
// Return true when the current tag is named `tag` and carries every
// attribute listed in `values` with exactly the given value.
// `values` is an array of alternating key/value strings terminated by a
// NULL key.
bool check(reader *self, const wchar_t *tag, const wchar_t **values) {
    if (!(self->tag == tag))
        return false;
    const wchar_t **pair = values;
    while (pair[0]) {
        auto found = self->values.find(pair[0]);
        if (found == self->values.end())
            return false;
        if (found->second != pair[1])
            return false;
        pair += 2;  // step to the next key/value pair
    }
    return true;
}
// Advance until the current tag satisfies check(self, tag, values).
// Returns false when the input is exhausted first.
bool find(reader *self, const wchar_t *tag, const wchar_t **values) {
    for (;;) {
        if (!read(self))
            return false;
        if (check(self, tag, values))
            return true;
    }
}
// Advance to the next tag. Clears text/tag/values first, then either
// delivers the pending synthetic end tag or parses on from `pos`.
// Returns false once `pos` has reached the end of the input.
bool read(reader *self) {
    self->text.clear();
    self->tag.clear();
    self->values.clear();
    if (static_cast<wstring::size_type>(self->pos) >= self->src.size())
        return false;
    if (self->reserved.empty()) {
        read_text(self);
        return true;
    }
    // tag was just cleared, so swapping moves reserved into tag and
    // leaves reserved empty.
    self->tag.swap(self->reserved);
    return true;
}
// Store the entity-decoded character data up to the next '<' in `text`,
// then parse the tag that follows. Without a further '<', the rest of the
// input becomes `text` and `pos` moves to the end.
static void read_text(reader *self) {
    const wstring::size_type start = self->pos;
    // Compare against npos directly instead of narrowing the result to int.
    const wstring::size_type lt = self->src.find(L'<', start);
    if (lt == wstring::npos) {
        self->text = from_entity(self->src.substr(start));
        self->pos = static_cast<int>(self->src.size());
        return;
    }
    self->text = from_entity(self->src.substr(start, lt - start));
    self->pos = static_cast<int>(lt + 1);
    read_tag(self);
}
// Consume one character: store it in `cur`, advance `pos`, and return it.
// At end of input `cur` becomes -1 and `pos` is left unchanged.
static int read_char(reader *self) {
    const bool at_end =
        static_cast<wstring::size_type>(self->pos) >= self->src.size();
    if (at_end) {
        self->cur = -1;
        return self->cur;
    }
    self->cur = self->src[self->pos];
    ++self->pos;
    return self->cur;
}
// Parse a tag whose '<' has already been consumed: read the name into
// `tag` (passed through lower()), queue a synthetic end tag for "<name/",
// then read the comment body or the attributes.
static void read_tag(reader *self) {
    wstring t;
    // Initialised so the checks after the loop are well defined when the
    // input ends right after '<' and the loop body never runs.
    wchar_t ch = 0;
    while (read_char(self) != -1) {
        ch = self->cur;
        if (ch == '>' || (ch == '/' && !t.empty()))
            break;
        if (ch > ' ') {
            t += ch;
            if (t == L"!--")
                break;
        } else if (!t.empty()) {
            // Whitespace after the name ends it; leading whitespace is skipped.
            break;
        }
    }
    self->tag = lower(t);
    if (ch == '/') {
        self->reserved = L"/" + self->tag;
        ch = read_char(self);
    }
    if (ch == '>')
        return;
    if (self->tag == L"!--") {
        read_comment(self);
    } else {
        while (read_values(self)) {
        }
    }
}
// Store the comment body (everything up to "-->") under values["comment"]
// and move `pos` past the terminator. An unterminated comment takes the
// rest of the input.
static void read_comment(reader *self) {
    const wstring::size_type start = self->pos;
    // Compare against npos directly instead of narrowing the result to int.
    const wstring::size_type end = self->src.find(L"-->", start);
    if (end == wstring::npos) {
        self->values[L"comment"] = self->src.substr(start);
        self->pos = static_cast<int>(self->src.size());
        return;
    }
    self->values[L"comment"] = self->src.substr(start, end - start);
    self->pos = static_cast<int>(end + 3);
}
// Parse one attribute of the current tag into `values`.
// Returns false when no attribute name could be read or when the tag's
// closing '>' has been consumed.
static bool read_values(reader *self) {
    const wstring name = lower(read_value(self, true));
    if (name.empty())
        return false;
    const int stop = self->cur;  // character that ended the name
    if (stop == '/')
        self->reserved = L"/" + self->tag;  // "<name attr/": queue the end tag
    if (stop == '=')
        self->values[name] = read_value(self, false);
    else
        self->values[name].clear();  // attribute without a value
    return self->cur != '>';
}
// Read one token: an attribute name (`isleft`) or an attribute value.
// Leading whitespace is skipped. The token ends at '>', at whitespace
// following it, at the closing quote of a double-quoted string, or (names
// only) at '=' or '/'.
static wstring read_value(reader *self, bool isleft) {
    wstring v;
    for (;;) {
        if (read_char(self) == -1)
            break;
        const wchar_t ch = self->cur;
        if (ch == '>')
            break;
        if (isleft && (ch == '=' || ch == '/'))
            break;
        if (ch == '"') {
            // Quoted string: copy verbatim up to the closing quote.
            while (read_char(self) != -1 && self->cur != '"')
                v += static_cast<wchar_t>(self->cur);
            break;
        }
        if (ch > ' ') {
            v += ch;
            continue;
        }
        if (!v.empty())
            break;
    }
    return v;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# public domain | |
from ctypes import * | |
def getfunc(dll, res, name, arg):
    """Look up *name* in *dll* and declare its ctypes signature.

    Sets ``restype`` to *res* and ``argtypes`` to *arg* on the looked-up
    function object, then returns it.
    """
    func = dll[name]
    for attr, spec in (("restype", res), ("argtypes", arg)):
        setattr(func, attr, spec)
    return func
# Load the native parser library and bind each exported function with its
# return type and argument types. The parser object is passed around as
# an opaque pointer (c_void_p).
dll = cdll.LoadLibrary("xml7shi2.dll")
alloc = getfunc(dll=dll, res=c_void_p, name="alloc", arg=[c_wchar_p])
release = getfunc(dll=dll, res=None, name="release", arg=[c_void_p])
gettext = getfunc(dll=dll, res=c_wchar_p, name="gettext", arg=[c_void_p])
gettag = getfunc(dll=dll, res=c_wchar_p, name="gettag", arg=[c_void_p])
getitem = getfunc(dll=dll, res=c_wchar_p, name="getitem",
                  arg=[c_void_p, c_wchar_p])
has_key = getfunc(dll=dll, res=c_bool, name="has_key",
                  arg=[c_void_p, c_wchar_p])
check = getfunc(dll=dll, res=c_bool, name="check",
                arg=[c_void_p, c_wchar_p, POINTER(c_wchar_p)])
find = getfunc(dll=dll, res=c_bool, name="find",
               arg=[c_void_p, c_wchar_p, POINTER(c_wchar_p)])
read = getfunc(dll=dll, res=c_bool, name="read", arg=[c_void_p])
def convmap(src):
    """Flatten the mapping *src* into a NULL-terminated ``c_wchar_p`` array.

    The array holds ``key0, value0, key1, value1, ...`` followed by one
    NULL entry, which is the layout the native ``check``/``find``
    functions walk (two entries per step until a NULL key).
    """
    # Two slots per entry plus the terminator; ctypes zero-initialises the
    # array, so the final slot is already NULL.
    ret = (c_wchar_p * (2 * len(src) + 1))()
    i = 0
    for key, value in src.items():
        ret[i] = key
        ret[i + 1] = value
        i += 2
    return ret
class reader:
    """Wrapper around the native parser with the pure-Python reader's API.

    ``text`` and ``tag`` are read-only properties fetched from the native
    object on each access; indexing returns an attribute value of the
    current tag.
    """

    text = property(lambda self: gettext(self.cself))
    tag = property(lambda self: gettag(self.cself))

    def __init__(self, src):
        """Allocate a native parser over the string *src*."""
        self.cself = alloc(src)

    def __del__(self):
        """Free the native parser, if one was allocated."""
        # pop() copes with an instance whose __init__ never set ``cself``
        # and makes a repeated call harmless.
        handle = self.__dict__.pop("cself", None)
        if handle is not None:
            release(handle)

    def __getitem__(self, key):
        """Return attribute *key* of the current tag (None if absent)."""
        return getitem(self.cself, key)

    def has_key(self, key):
        """Return True if the current tag has an attribute named *key*."""
        return has_key(self.cself, key)

    def find(self, tag, values=None):
        """Advance to the next tag named *tag* carrying *values*.

        Returns True when found, False at end of input.
        """
        return find(self.cself, tag, convmap({} if values is None else values))

    def each(self, tag="", values=None):
        """Iterate over the tags up to the end tag of the current tag.

        Yields a running index for every tag read (when *tag* is ``""``)
        or for every tag matching *tag* and *values*.
        """
        end = "/" + self.tag
        i = 0
        # Build the native array once, not once per tag.
        vmap = convmap({} if values is None else values)
        while self.tag != end and self.read():
            if tag == "" or check(self.cself, tag, vmap):
                yield i
                # NOTE(review): the source had lost its indentation; the
                # counter is assumed to advance only for yielded tags.
                i += 1

    def read(self):
        """Advance to the next tag; return False at end of input."""
        return read(self.cself)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Python 3 version