Skip to content

Instantly share code, notes, and snippets.

@cdpath
Created April 29, 2019 05:07
Show Gist options
  • Save cdpath/aa8ec12acc766a878b3f931e2de96aa5 to your computer and use it in GitHub Desktop.
Save cdpath/aa8ec12acc766a878b3f931e2de96aa5 to your computer and use it in GitHub Desktop.
托福红宝书
import csv
import os
import re
from functools import partial
def is_class_of(tag, class_):
    """Return True when the first CSS class of *tag* equals *class_*.

    Args:
        tag: Any object. It is expected to expose ``attrs`` as a mapping
            whose ``'class'`` entry is an indexable sequence of class names.
        class_: Class name compared against the first entry of that sequence.

    Returns:
        The result of the comparison, or ``False`` when *tag* has no
        ``attrs``, no ``'class'`` entry, or an empty class list.
    """
    try:
        return tag.attrs['class'][0] == class_
    except (AttributeError, KeyError, IndexError, TypeError):
        # Only "there is no usable class list" is treated as a non-match;
        # any other failure is a real bug and is allowed to surface.
        return False
# Predicates over parsed tags, each matching on the tag's first CSS class.
# They all return False (never raise) for objects without a class list.
is_entry = partial(is_class_of, class_='subtitle4')  # first tag of a word group
is_pos = partial(is_class_of, class_='normaltext3')  # second tag; text is bracketed, e.g. "[...]"
is_text = partial(is_class_of, class_='normaltext')  # definition and labelled extra lines
is_picture = partial(is_class_of, class_='picture')  # tolerated after the pos tag, otherwise ignored
def gen_word(tags):
    """Group a flat sequence of tags into one list per word.

    A new group starts at every tag for which ``is_entry`` is true; all tags
    that follow belong to that group until the next entry tag. Tags that
    appear before the first entry form a leading group of their own, whose
    first element is therefore not an entry tag.

    Args:
        tags: Iterable of parsed tags in document order.

    Yields:
        list: The consecutive tags of one group. Groups are never empty.
    """
    word = []
    for tag in tags:
        if word and is_entry(tag):
            yield word
            word = [tag]
        else:
            word.append(tag)
    # The loop only emits a group once the *next* entry shows up, so the
    # trailing group has to be flushed explicitly or the last word is lost.
    if word:
        yield word
def validate(w):
    """Check that a tag group has the layout ``parse_word`` relies on.

    The expected layout is: an entry tag, a part-of-speech tag whose text is
    wrapped in square brackets, then at least one more tag, each of which is
    either a text tag or a picture tag.

    Args:
        w: List of tags making up one word group.

    Raises:
        AssertionError: If the group does not have that layout. The checks
            are explicit raises rather than ``assert`` statements so they
            still run when Python is started with ``-O``.
    """
    if len(w) <= 2:
        raise AssertionError(
            "expected entry, pos and at least one more tag, got %d tag(s)" % len(w)
        )
    if not is_entry(w[0]):
        raise AssertionError("first tag is not an entry tag")
    if not is_pos(w[1]):
        raise AssertionError("second tag is not a part-of-speech tag")
    pos = w[1].text
    if not (pos.startswith('[') and pos.endswith(']')):
        raise AssertionError("part of speech is not wrapped in brackets: %r" % pos)
    for tag in w[2:]:
        if not (is_text(tag) or is_picture(tag)):
            raise AssertionError("unexpected tag after part of speech: %r" % (tag,))
def is_validate(w):
    """Tell whether the tag group *w* passes ``validate``.

    Args:
        w: List of tags making up one word group.

    Returns:
        bool: ``True`` if ``validate(w)`` completes, ``False`` if it raises
        ``AssertionError``. Any other exception propagates.
    """
    try:
        validate(w)
    except AssertionError:
        passed = False
    else:
        passed = True
    return passed
# One or more characters from the CJK Unified Ideographs block; the first hit
# in a segment marks where the Chinese translation begins.
_CHINESE_PAT = re.compile(r'[\u4e00-\u9fff]+')


def parse_example(example):
    """Split an example line into its English and Chinese parts.

    The line may hold several examples separated by ``//``. Within each
    segment, everything before the first Chinese character is taken as the
    English sentence and everything from that character on as the
    translation; both halves are stripped of surrounding whitespace.

    Args:
        example: Example text, e.g. ``"Sentence one. 译文 // Sentence two. 译文"``.

    Returns:
        tuple: ``(en, zh)``, two lists of equal length with one item per
        ``//``-separated segment.

    Raises:
        ValueError: If *example* is not a string or a segment contains no
            Chinese character. The underlying error is chained as the cause.
    """
    en = []
    zh = []
    try:
        for mix in example.split("//"):
            m = _CHINESE_PAT.search(mix)
            if m is None:
                raise LookupError("no Chinese text in segment %r" % mix)
            cut = m.start()
            en.append(mix[:cut].strip())
            zh.append(mix[cut:].strip())
    except (AttributeError, TypeError, LookupError) as err:
        print(example)
        raise ValueError("Failed to parse exmple for %s" % example) from err
    return en, zh
def parse_word(w, error_words):
    """Convert a validated tag group into a flat dict of fields.

    The first three tags supply ``'entry'``, ``'pos'`` and ``'definition'``.
    Every later text tag is read as ``label<U+3000>value`` and stored under
    its label. If a ``'例'`` (example) field is present it is split by
    ``parse_example``: ``'例'`` is replaced by the English sentences and
    ``'译'`` receives the translations, each joined with ``<br>``.

    Args:
        w: List of tags for one word, at least three long.
        error_words: List that is mutated in place: *w* is appended to it
            when the labelled lines cannot be parsed. The word is still
            returned in that case, with only its three base fields.

    Returns:
        dict: Field name to text.

    Raises:
        ValueError: Propagated from ``parse_example`` when an example
            segment has no Chinese part.
    """
    word = {
        'entry': w[0].text,
        'pos': w[1].text,
        'definition': w[2].text,
    }
    try:
        # Split on the first ideographic space only, so a value that itself
        # contains U+3000 does not make the whole word fail.
        extras = dict(
            tag.text.split('\u3000', 1) for tag in w[3:] if is_text(tag)
        )
    except (AttributeError, TypeError, ValueError):
        # A line without any U+3000 yields a 1-item list, which dict() rejects.
        error_words.append(w)
    else:
        word.update(extras)
    example = word.pop('例', None)
    if example:
        en, zh = parse_example(example)
        word['例'] = '<br>'.join(en)
        word['译'] = '<br>'.join(zh)
    return word
def save_not_processed_words(words, filename):
    """Dump the tag groups that could not be processed to a text file.

    Each element of every group is written via ``str()`` on its own line,
    groups following one another without a separator. A summary line is
    printed afterwards.

    Args:
        words: List of groups, each an iterable of tags.
        filename: Path of the file to create or overwrite.
    """
    # Explicit UTF-8: the markup contains Chinese text, and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(
            '\n'.join(str(t) for t in w) + '\n'
            for w in words
        )
    print("Saved %d broken words to %s" % (len(words), filename))
def save_words(words, filename):
    """Write parsed words to a CSV file, one row per word.

    Column order is: entry, pos, definition, then the optional fields
    ``例``, ``译``, ``记``, ``搭``, ``派``, ``参``. A missing optional field
    becomes an empty cell. A summary line is printed afterwards.

    Args:
        words: List of dicts as returned by ``parse_word``; ``'entry'``,
            ``'pos'`` and ``'definition'`` are required keys.
        filename: Path of the CSV file to create or overwrite.

    Raises:
        KeyError: If a word lacks one of the three required keys.
    """
    optional_fields = ('例', '译', '记', '搭', '派', '参')
    # newline='' is required by the csv module so that it controls line
    # endings itself; UTF-8 is set explicitly because the rows hold Chinese.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerows(
            [w['entry'], w['pos'], w['definition']]
            + [w.get(field) for field in optional_fields]
            for w in words
        )
    print("Saved %d words to %s" % (len(words), filename))
def main(filename):
    """Extract the words from an HTML file into a CSV plus an error dump.

    For an input ``name.ext`` the results go to ``name.csv`` and the groups
    that failed validation or field parsing go to ``name.error.html``.

    Args:
        filename: Path of the HTML file to read.
    """
    name, _ = os.path.splitext(filename)
    out_f, err_f = name + '.csv', name + '.error.html'
    # NOTE(review): BeautifulSoup is not imported anywhere in the visible
    # file; presumably `from bs4 import BeautifulSoup` is needed - confirm.
    # The handle is closed as soon as the document has been parsed.
    with open(filename) as f:
        soup = BeautifulSoup(f)
    words, remains = [], []
    for w in gen_word(soup.find_all('p')):
        if not is_entry(w[0]):
            # Tags that precede the first entry do not belong to any word.
            continue
        if is_validate(w):
            words.append(w)
        else:
            remains.append(w)
    # parse_word may append further groups to `remains`.
    words = [parse_word(w, remains) for w in words]
    save_not_processed_words(remains, err_f)
    save_words(words, out_f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment