Created April 29, 2019 05:07
托福红宝书 (the TOEFL "Red Book" word list)
import csv
import os
import re
from functools import partial

from bs4 import BeautifulSoup


def is_class_of(tag, class_):
    """Return True if the tag's first CSS class equals class_."""
    try:
        return tag.attrs['class'][0] == class_
    except Exception:
        return False


# Each <p> in the source HTML is classified by its CSS class.
is_entry = partial(is_class_of, class_='subtitle4')     # head word
is_pos = partial(is_class_of, class_='normaltext3')     # part of speech, e.g. [n.]
is_text = partial(is_class_of, class_='normaltext')     # definition / example lines
is_picture = partial(is_class_of, class_='picture')     # illustration


def gen_word(tags):
    """Group consecutive <p> tags into one list per head word."""
    word = []
    for tag in tags:
        if word and is_entry(tag):
            yield word
            word = [tag]
        else:
            word.append(tag)
    if word:  # don't drop the last entry
        yield word


def validate(w):
    assert len(w) > 2
    assert is_entry(w[0])
    assert is_pos(w[1])
    pos = w[1].text
    assert pos.startswith('[') and pos.endswith(']')
    for tag in w[2:]:
        assert is_text(tag) or is_picture(tag)


def is_validate(w):
    """Return True if the tag group looks like a well-formed word entry."""
    try:
        validate(w)
    except AssertionError:
        return False
    return True


def parse_example(example):
    """Split an '例' line into parallel English / Chinese sentence lists."""
    chinese_pat = re.compile(r'[\u4e00-\u9fff]+')
    try:
        mixes = example.split("//")
        en = []
        zh = []
        for mix in mixes:
            # The first Chinese character marks where the translation starts.
            m = list(chinese_pat.finditer(mix))[0]
            en.append(mix[:m.start()].strip())
            zh.append(mix[m.start():].strip())
    except Exception:
        print(example)
        raise ValueError("Failed to parse example: %s" % example)
    return en, zh


def parse_word(w, error_words):
    word = {
        'entry': w[0].text,
        'pos': w[1].text,
        'definition': w[2].text,
    }
    try:
        # Remaining text lines look like '例\u3000...', '记\u3000...', etc.;
        # split each on the ideographic space into a key/value pair.
        word.update(dict(t.text.split('\u3000') for t in w[3:] if is_text(t)))
    except Exception:
        error_words.append(w)
    example = word.pop('例', None)
    if example:
        en, zh = parse_example(example)
        word['例'] = '<br>'.join(en)
        word['译'] = '<br>'.join(zh)
    return word


def save_not_processed_words(words, filename):
    with open(filename, 'w') as f:
        for w in words:
            html_str = '\n'.join(str(t) for t in w)
            f.write(html_str + '\n')
    print("Saved %d broken words to %s" % (len(words), filename))


def save_words(words, filename):
    with open(filename, 'w') as f:
        csv_writer = csv.writer(f)
        for w in words:
            csv_writer.writerow((
                w['entry'], w['pos'], w['definition'],
                w.get('例'), w.get('译'),
                w.get('记'), w.get('搭'),
                w.get('派'), w.get('参')
            ))
    print("Saved %d words to %s" % (len(words), filename))


def main(filename):
    name, ext = os.path.splitext(filename)
    out_f, err_f = name + '.csv', name + '.error.html'
    soup = BeautifulSoup(open(filename), 'html.parser')
    ws = filter(lambda x: is_entry(x[0]), gen_word(soup.find_all('p')))
    words, remains = [], []
    for w in ws:
        if is_validate(w):
            words.append(w)
        else:
            remains.append(w)
    words = [parse_word(w, remains) for w in words]
    save_not_processed_words(remains, err_f)
    save_words(words, out_f)
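
The gist defines main but never calls it. A minimal entry point, not part of the original script, assuming the exported HTML file is passed on the command line (the filenames in the comment are hypothetical):

if __name__ == '__main__':
    import sys
    # e.g. python hongbaoshu.py hongbaoshu.html
    main(sys.argv[1])

The CSV written by save_words then has one row per head word with the columns entry, pos, definition, 例/译 (example sentences and their translations), and, when present, 记, 搭, 派 and 参, which in the Red Book's layout presumably hold the mnemonic, collocations, derivatives and cross-references.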