knzm · May 8, 2011 14:29
diff --git a/dl_tia_list.py b/dl_tia_list.py
 # -*- coding: utf-8 -*-

 import logging
 import urllib
 import lxml.html

 # Module-wide logger; stays None until init_log() binds it to the root logger.
 log = None

 # Page that main() downloads and scans for "div.circle_list" entries.
 list_url = "http://www.comitia.co.jp/history/96list.html"

 def init_log():
     """Bind the module-level ``log`` to the root logger on first use.

     Later calls are no-ops, because ``log`` is only assigned while it is
     still ``None``.
     """
     global log
     if log is not None:
         return
     log = logging.getLogger()

 def fetch(url):
     """Download *url* and return the raw response body.

     The response object is closed even if reading it fails.
     """
     response = urllib.urlopen(url)
     try:
         payload = response.read()
     finally:
         response.close()
     return payload

 def read_dict(f):
     """Parse a Shift-JIS, tab-separated reading dictionary.

     Each non-blank line must hold at least two tab-separated fields: the
     reading first, then the name. Any further fields are ignored.

     :param f: iterable of byte-string lines (e.g. an open file).
     :returns: list of ``{"read": ..., "name": ...}`` dicts holding decoded
         strings, in input order.
     :raises ValueError: if a non-blank line has fewer than two fields.
     """
     entries = []
     for raw in f:
         # Drop the line terminator first; otherwise a two-column line leaves
         # "\n" glued to the name and it can never match a scraped name.
         text = raw.decode('shift-jis').rstrip(u"\r\n")
         if not text.strip():
             # Blank lines carry no entry; skip them instead of failing on
             # the tuple unpacking below.
             continue
         read, name = text.split(u"\t")[:2]
         entries.append({"read": read, "name": name})
     return entries

 # Voiced and semi-voiced hiragana plus katakana "vu", paired position by
 # position with the plain kana each one is reduced to.
 _MARKED_KANA = u"がぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽヴ"
 _PLAIN_KANA = u"かきくけこさしすせそたちつてとはひふへほはひふへほう"
 _INITIAL_TABLE = dict(zip(_MARKED_KANA, _PLAIN_KANA))


 def normalize_initial(ch):
     """Return the plain kana that the initial *ch* is reduced to.

     Voiced (dakuten) and semi-voiced (handakuten) hiragana map to their
     unvoiced counterpart, and katakana "ヴ" maps to "う". Every other value,
     including the empty string and multi-character strings, is returned
     unchanged.

     An explicit table is used instead of code-point arithmetic so that an
     empty or multi-character argument can never reach ``ord()`` and raise.
     """
     return _INITIAL_TABLE.get(ch, ch)

 def find_read(name, read_from_name):
     """Look up the reading registered for *name*.

     The name is tried verbatim first. If that misses, a handful of
     full-width punctuation marks are swapped for the variants listed in
     ``substitutions`` and the lookup is retried once.

     Returns the reading, or None when neither spelling is a key of
     *read_from_name*.
     """
     if name in read_from_name:
         return read_from_name[name]

     substitutions = (
         (u"～", u"〜"), (u"＊", u"*"), (u"”", '"'), (u"＜", "<"),
         (u"＞", ">"), (u"！", "!"), (u"？", "?"), (u"＿", "_"),
     )
     folded = name
     for before, after in substitutions:
         folded = folded.replace(before, after)

     if folded not in read_from_name:
         return None
     return read_from_name[folded]

 def main(dict_=None):
    init_log()

    content = fetch(list_url)
    doc = lxml.html.fromstring(content)

    read_from_name = {}
    if dict_:
        for d in dict_:
            name = d["name"]
            if name in read_from_name:
                log.warn(u"Duplicated: %s" % name)
                continue
            read_from_name[name] = d["read"]

    unused = {}
    for name, read in read_from_name.iteritems():
        unused[read] = name

    d = {}
    nodes = doc.cssselect("#circle_list_container div.circle_list")
    for div in nodes[1:]:
        pos, name = div.text_content().split(None, 1)
        d[pos] = {"name": name}
        anchors = div.cssselect("a")
        if len(anchors) > 0:
            href = anchors[0].attrib.get("href")
            d[pos]["url"] = href
        read = find_read(name, read_from_name)
        if read:
            d[pos]["read"] = read
            d[pos]["initial"] = normalize_initial(read[:1])
            try:
                unused.pop(read)
            except KeyError:
                pass

    if unused:
        for read, name in sorted(unused.iteritems()):
            log.warn(u"Unused: %s (%s)" % (name, read))

    def order_by_area(pair):
        pos, value = pair
        area = pos[0]
        if u"Ａ" <= area <= u"Ｚ":
            area_code = 0
        elif u"あ" <= area <= u"ん":
            area_code = 1
        else:
            area_code = 2
        return (area_code, pos)

    def order_by_name(pair):
        pos, value = pair
        initial = value.get("initial", "")
        read = value.get("read", "")
        name = value.get("name", "")
        return (initial == "", read, name)

    for pos, value in sorted(d.items(), key=order_by_name):
        name = value.get("name")
        read = value.get("read", "")
        url = value.get("url", "")
        initial = value.get("initial", "")
        line = u"%s\t%s\t%s\t%s\t%s" % (initial, pos, name, read, url)
        print line.encode('utf-8')


 if __name__ == '__main__':
     import sys

     # Timestamped log lines at INFO level and above.
     logging.basicConfig(
         level=logging.INFO,
         datefmt="%H:%M:%S",
         format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
     )

     # The optional first command-line argument names the dictionary file
     # that read_dict() parses; without it main() runs with no readings.
     dict_ = None
     if sys.argv[1:]:
         f = open(sys.argv[1])
         try:
             dict_ = read_dict(f)
         finally:
             f.close()
     main(dict_)
	# -*- coding: utf-8 -*-

	import logging
	import urllib
	import lxml.html

	# Module-wide logger; stays None until init_log() binds it to the root logger.
	log = None

	# Page that main() downloads and scans for "div.circle_list" entries.
	list_url = "http://www.comitia.co.jp/history/96list.html"

	def init_log():
	    """Bind the module-level ``log`` to the root logger on first use.

	    Later calls are no-ops, because ``log`` is only assigned while it is
	    still ``None``.
	    """
	    global log
	    if log is not None:
	        return
	    log = logging.getLogger()

	def fetch(url):
	    """Download *url* and return the raw response body.

	    The response object is closed even if reading it fails.
	    """
	    response = urllib.urlopen(url)
	    try:
	        payload = response.read()
	    finally:
	        response.close()
	    return payload

	def read_dict(f):
	    """Parse a Shift-JIS, tab-separated reading dictionary.

	    Each non-blank line must hold at least two tab-separated fields: the
	    reading first, then the name. Any further fields are ignored.

	    :param f: iterable of byte-string lines (e.g. an open file).
	    :returns: list of ``{"read": ..., "name": ...}`` dicts holding decoded
	        strings, in input order.
	    :raises ValueError: if a non-blank line has fewer than two fields.
	    """
	    entries = []
	    for raw in f:
	        # Drop the line terminator first; otherwise a two-column line leaves
	        # "\n" glued to the name and it can never match a scraped name.
	        text = raw.decode('shift-jis').rstrip(u"\r\n")
	        if not text.strip():
	            # Blank lines carry no entry; skip them instead of failing on
	            # the tuple unpacking below.
	            continue
	        read, name = text.split(u"\t")[:2]
	        entries.append({"read": read, "name": name})
	    return entries

	# Voiced and semi-voiced hiragana plus katakana "vu", paired position by
	# position with the plain kana each one is reduced to.
	_MARKED_KANA = u"がぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽヴ"
	_PLAIN_KANA = u"かきくけこさしすせそたちつてとはひふへほはひふへほう"
	_INITIAL_TABLE = dict(zip(_MARKED_KANA, _PLAIN_KANA))


	def normalize_initial(ch):
	    """Return the plain kana that the initial *ch* is reduced to.

	    Voiced (dakuten) and semi-voiced (handakuten) hiragana map to their
	    unvoiced counterpart, and katakana "ヴ" maps to "う". Every other value,
	    including the empty string and multi-character strings, is returned
	    unchanged.

	    An explicit table is used instead of code-point arithmetic so that an
	    empty or multi-character argument can never reach ``ord()`` and raise.
	    """
	    return _INITIAL_TABLE.get(ch, ch)

	def find_read(name, read_from_name):
	    """Look up the reading registered for *name*.

	    The name is tried verbatim first. If that misses, a handful of
	    full-width punctuation marks are swapped for the variants listed in
	    ``substitutions`` and the lookup is retried once.

	    Returns the reading, or None when neither spelling is a key of
	    *read_from_name*.
	    """
	    if name in read_from_name:
	        return read_from_name[name]

	    substitutions = (
	        (u"～", u"〜"), (u"＊", u"*"), (u"”", '"'), (u"＜", "<"),
	        (u"＞", ">"), (u"！", "!"), (u"？", "?"), (u"＿", "_"),
	    )
	    folded = name
	    for before, after in substitutions:
	        folded = folded.replace(before, after)

	    if folded not in read_from_name:
	        return None
	    return read_from_name[folded]

	def main(dict_=None):
	init_log()

	content = fetch(list_url)
	doc = lxml.html.fromstring(content)

	read_from_name = {}
	if dict_:
	for d in dict_:
	name = d["name"]
	if name in read_from_name:
	log.warn(u"Duplicated: %s" % name)
	continue
	read_from_name[name] = d["read"]

	unused = {}
	for name, read in read_from_name.iteritems():
	unused[read] = name

	d = {}
	nodes = doc.cssselect("#circle_list_container div.circle_list")
	for div in nodes[1:]:
	pos, name = div.text_content().split(None, 1)
	d[pos] = {"name": name}
	anchors = div.cssselect("a")
	if len(anchors) > 0:
	href = anchors[0].attrib.get("href")
	d[pos]["url"] = href
	read = find_read(name, read_from_name)
	if read:
	d[pos]["read"] = read
	d[pos]["initial"] = normalize_initial(read[:1])
	try:
	unused.pop(read)
	except KeyError:
	pass

	if unused:
	for read, name in sorted(unused.iteritems()):
	log.warn(u"Unused: %s (%s)" % (name, read))

	def order_by_area(pair):
	pos, value = pair
	area = pos[0]
	if u"Ａ" <= area <= u"Ｚ":
	area_code = 0
	elif u"あ" <= area <= u"ん":
	area_code = 1
	else:
	area_code = 2
	return (area_code, pos)

	def order_by_name(pair):
	pos, value = pair
	initial = value.get("initial", "")
	read = value.get("read", "")
	name = value.get("name", "")
	return (initial == "", read, name)

	for pos, value in sorted(d.items(), key=order_by_name):
	name = value.get("name")
	read = value.get("read", "")
	url = value.get("url", "")
	initial = value.get("initial", "")
	line = u"%s\t%s\t%s\t%s\t%s" % (initial, pos, name, read, url)
	print line.encode('utf-8')


	if __name__ == '__main__':
	    import sys

	    # Timestamped log lines at INFO level and above.
	    logging.basicConfig(
	        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
	        datefmt="%H:%M:%S",
	        level=logging.INFO,
	    )

	    # The optional first command-line argument names the dictionary file
	    # that read_dict() parses; without it main() runs with no readings.
	    dict_ = None
	    if len(sys.argv) > 1:
	        f = open(sys.argv[1])
	        try:
	            dict_ = read_dict(f)
	        finally:
	            f.close()
	    main(dict_)
No results found