Created
December 30, 2010 06:21
-
-
Save mohayonao/759520 to your computer and use it in GitHub Desktop.
Yahoo!辞書で言葉を調べる
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import urllib | |
from collections import namedtuple | |
import BeautifulSoup | |
# One dictionary entry: which dictionary it came from (`type`), its headword
# (`lemma`), a list of DictDefin records (`defins`) and a dict of extra
# entry-level information (`opts`).
DictResult = namedtuple(
    "YahooDictResult",
    ["type", "lemma", "defins", "opts"],
)

# One definition inside an entry: its position marker (`index`), the
# definition text (`defin`) and a dict of extra per-definition data (`opts`).
DictDefin = namedtuple(
    "YahooDictDefin",
    ["index", "defin", "opts"],
)

# Search endpoint and the query-string template; the single %s placeholder
# receives the URL-quoted search phrase.
BASE_URL = 'http://dic.search.yahoo.co.jp/search'
QUERY_STRING = '?stype=exact&ei=UTF-8&p=%s'
def lookup(word, yomi=None, using=None):
    """Look a word up on Yahoo! dictionary search and parse the result page.

    Args:
        word: The word to search for (byte string or unicode).
        yomi: Optional reading; when given it is appended to ``word``
            separated by a single space to form the search phrase.
        using: Iterable of dictionary type names to accept. Defaults to
            ``('JJ', 'seiji', 'singo')``.

    Returns:
        A list of ``DictResult`` records, one per ``<li>`` that the matching
        dictionary parser could turn into a result, in page order.
    """
    if using is None:
        using = ('JJ', 'seiji', 'singo')

    # Dictionary type (the <h2> class on the page) -> parser for its <li> items.
    parsers = {
        'JJ': _dict_parse_for_YahooDictionary_JJ,
        'seiji': _dict_parse_for_YahooDictionary_seiji,
        'singo': _dict_parse_for_YahooDictionary_singo,
    }

    q = u' '.join((word, yomi)) if yomi else word
    if isinstance(q, unicode):
        # urllib.quote works on bytes; the request declares ei=UTF-8.
        q = q.encode('utf-8')
    url = BASE_URL + QUERY_STRING % urllib.quote(q)

    # Close the HTTP response explicitly instead of leaking the handle.
    response = urllib.urlopen(url)
    try:
        html = response.read().decode('utf-8')
    finally:
        response.close()

    soup = BeautifulSoup.BeautifulSoup(html)
    result = []
    for div in soup.findAll('div', {'class': 'result-r'}):
        h2 = div.find('h2', {'class': using})
        if not h2:
            continue
        # Resolve the parser once per result block rather than once per item.
        parser = parsers.get(h2['class'])
        if parser is None:
            # A type requested via `using` that has no parser yields nothing.
            continue
        for li in div.findAll('li'):
            parsed = parser(li)
            if parsed:
                result.append(parsed)
    return result
def pp(results):
    """Render lookup results as human-readable, newline-separated text.

    Args:
        results: Iterable of records exposing ``type``, ``lemma``, ``opts``
            (dict or falsy) and ``defins`` (iterable of records exposing
            ``index``, ``defin`` and ``opts``).

    Returns:
        A unicode string. Each result contributes its dictionary type, its
        headword, any non-empty entry-level options, one line per definition
        followed by that definition's non-empty options, and a trailing
        empty line. Returns an empty string for empty input.
    """
    lines = []
    for r in results:
        lines.append(u'辞書の種類: %s' % r.type)
        lines.append(u'見出し語 : %s' % r.lemma)
        if r.opts:
            lines.append(u'その他の情報:')
            for k, v in r.opts.items():
                if v:
                    lines.append(u' %s: %s' % (k, v))
        for d in r.defins:
            lines.append(u'%s:%s' % (d.index, d.defin))
            if not d.opts:
                continue
            for k, v in d.opts.items():
                if not v:
                    continue
                if isinstance(v, list):
                    # List-valued options are shown comma-separated.
                    v = u','.join(v)
                lines.append(u' %s: %s' % (k, v))
        # Blank line closes each result.
        lines.append(u'')
    # The separator must be a real newline escape; the previous yen-sign
    # spelling produced a literal two-character string instead.
    return u'\n'.join(lines)
# The per-dictionary parsers are lengthy, so they are defined below this point.
def _dict_parse_for_YahooDictionary_JJ(elem):
    """Parse one result item of the Japanese-language dictionary ('JJ').

    Args:
        elem: BeautifulSoup tag for one result; its ``h3 > a`` text is the
            heading and its ``div`` children hold the definition body.

    Returns:
        ``DictResult('JJ', lemma, defins, opts)`` where ``opts`` has the keys
        ``yomi`` (reading, or None) and ``hinshi`` (last part-of-speech tag
        seen, or None). Each element of ``defins`` is a ``DictDefin`` whose
        ``index`` is an ``(idx, sub_idx)`` tuple and whose ``opts`` has the
        keys ``hint``, ``yourei``, ``taigigo`` and ``link``.
    """
    # Heading is matched as "<reading>【<headword>】"; otherwise the whole
    # heading is the headword and there is no reading.
    heading = elem.h3.a.text
    m = re.search(u'(.+?)【(.+?)】', heading)
    if m:
        lemma = m.group(2)
        yomi = m.group(1).replace(u'‐', '')
    else:
        lemma = heading
        yomi = None

    # Numeric file name of a marker image -> sub-index it stands for.
    # Any other numbered .gif (e.g. 2540, black square 2) maps to -1, which
    # suppresses the plain text that follows it.
    gaiji_sub_index = {
        1676: 1,  # circled 1
        1678: 2,  # circled 2
        2513: 3,  # circled 3
        2531: 1,  # white square 1
        2539: 1,  # black square 1
    }

    # Pass 1: walk the body and group text fragments by (idx, sub_idx).
    cands = []
    idx, sub_idx = 0, 0
    for node in elem.div:
        if isinstance(node, BeautifulSoup.NavigableString):
            if sub_idx == -1:
                continue
            piece = node.strip()
            if piece:
                _jj_add_fragment(cands, (idx, sub_idx), piece)
        elif isinstance(node, BeautifulSoup.Tag):
            if node.name == 'img' and node.has_key('src'):
                src = node['src']
                if not src.endswith('.gif'):
                    continue
                try:
                    # ".../1676.gif" -> 1676
                    gaiji_idx = int(src[src.rindex('/') + 1:-4])
                except ValueError:
                    continue
                sub_idx = gaiji_sub_index.get(gaiji_idx, -1)
            elif node.name == 'b':
                label = node.text
                if not label:
                    # An empty <b> carries no number and no text.
                    continue
                # Full-width digits '１'..'９' start a new numbered sense.
                digit = ord(label[0]) - 0xff10
                if 1 <= digit < 10:
                    # NOTE(review): sub_idx is left as-is here, matching the
                    # original logic; confirm it should not reset per sense.
                    idx = digit
                else:
                    _jj_add_fragment(cands, (idx, sub_idx), label)

    # Pass 2: pull secondary information (part of speech, usage examples,
    # antonym, cross-reference) out of each collected definition.
    #
    # The previous pattern used yen signs where backslashes were intended and
    # so never matched. NOTE(review): this assumes part-of-speech tags are a
    # bracketed run such as ［名］ or [名], optionally followed by (スル);
    # confirm against the live markup.
    hinshi_re = re.compile(u'[\\[［]\\D+?[\\]］](?:[(（]スル[)）])?')
    yourei_re = re.compile(u'(「(?:[^」]*?)—(?:[^「]*?)」)')

    hinshi = None
    defin = []
    for cand in cands:
        text = cand.defin

        m = hinshi_re.match(text)
        if m:
            hinshi = m.group(0)
            # Strip only the leading tag, not every later occurrence.
            text = text[m.end():]

        # The first 《...》 annotation is removed from the text. Its value is
        # deliberately not reported: `hint` has always been None in the
        # result. NOTE(review): confirm whether it should be surfaced.
        annotation = re.search(u'《.+?》', text)
        if annotation:
            text = text.replace(annotation.group(0), '')
        hint = None

        yourei = yourei_re.findall(text)
        for example in yourei:
            text = text.replace(example, '')

        taigigo = None
        if u'⇔' in text:
            text, taigigo = _jj_split_trailing(text, u'⇔')

        link = None
        if u'⇒' in text:
            text, link = _jj_split_trailing(text, u'⇒')
        elif u'→' in text:
            text, link = _jj_split_trailing(text, u'→')

        if text or hint or yourei or taigigo or link:
            defin_opts = dict(hint=hint, yourei=yourei,
                              taigigo=taigigo, link=link)
            defin.append(DictDefin(cand.index, text, defin_opts))

    opts = dict(yomi=yomi, hinshi=hinshi)
    return DictResult('JJ', lemma, defin, opts)


def _jj_add_fragment(cands, index, fragment):
    """Append `fragment` to the last candidate if it has `index`, else add a new one."""
    if cands and cands[-1].index == index:
        cands[-1] = DictDefin(index, cands[-1].defin + fragment, {})
    else:
        cands.append(DictDefin(index, fragment, {}))


def _jj_split_trailing(text, marker):
    """Split `text` at the last `marker`; return (head, tail without a final '。')."""
    pos = text.rindex(marker)
    tail = text[pos + 1:]
    if tail.endswith(u'。'):
        tail = tail[:-1]
    return text[:pos], tail
def _dict_parse_for_YahooDictionary_seiji(elem):
    """Parse one result item of the political-terms glossary ('seiji').

    The ``h3 > a`` text becomes the headword and the whole ``div`` text
    becomes a single definition indexed ``(0, 0)``; no extra options are set.
    """
    headword = elem.h3.a.text
    body = elem.div.text
    only_definition = DictDefin((0, 0), body, {})
    return DictResult('seiji', headword, [only_definition], {})
def _dict_parse_for_YahooDictionary_singo(elem):
    """Parse one result item of the new-words dictionary ('singo').

    The ``h3 > a`` text becomes the headword and the whole ``div`` text
    becomes a single definition indexed ``(0, 0)``; no extra options are set.
    """
    headword = elem.h3.a.text
    body = elem.div.text
    only_definition = DictDefin((0, 0), body, {})
    return DictResult('singo', headword, [only_definition], {})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment