Skip to content

Instantly share code, notes, and snippets.

@hzqtc
Last active December 16, 2015 15:39
Show Gist options
  • Save hzqtc/5457994 to your computer and use it in GitHub Desktop.
Save hzqtc/5457994 to your computer and use it in GitHub Desktop.
Command line English-Chinese dictionary. Data from http://www.iciba.com. Requires lxml, clint and mpg123 (for pronouncing only).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import getopt
import os
import re
import subprocess
import sys
import urllib

import lxml.html
from clint.textui import puts, indent, colored
class WordPronunciation(object):
    """Pronunciation data for one accent of a word.

    Attributes:
        phonetic_symbol: Phonetic transcription text ('' when unknown).
        sound_url: Location of the pronunciation audio ('' when unknown).
    """

    def __init__(self, phonetic_symbol='', sound_url=''):
        """Create a pronunciation record; both fields default to ''."""
        self.phonetic_symbol = phonetic_symbol
        self.sound_url = sound_url

    def __repr__(self):
        return '%s(phonetic_symbol=%r, sound_url=%r)' % (
            type(self).__name__, self.phonetic_symbol, self.sound_url)
class WordDefinition(object):
    """One dictionary sense of a word together with its usage examples.

    Attributes:
        explaination: Text describing the sense ('' when unknown).  The
            attribute keeps its historical spelling because other code
            reads it under that name.
        examples: List of example entries; each instance owns its own list.
    """

    def __init__(self, explaination='', examples=None):
        """Create a definition record.

        Args:
            explaination: Initial sense text.
            examples: Optional iterable of example entries.  It is copied,
                so the caller's container is never shared or mutated.
        """
        self.explaination = explaination
        self.examples = [] if examples is None else list(examples)

    def __repr__(self):
        return '%s(explaination=%r, examples=%r)' % (
            type(self).__name__, self.explaination, self.examples)
class Word(object):
    """A looked-up word: its spelling, pronunciations and definitions.

    Attributes:
        spell: The word's spelling.
        pronunciation: Dict mapping an accent name to an object exposing
            ``phonetic_symbol`` and ``sound_url``.
        definition: List of objects exposing ``explaination`` and
            ``examples`` (a list of two-item example entries).
    """

    def __init__(self):
        self.spell = ''
        self.pronunciation = {}
        self.definition = []

    def pronounce(self, prefer='american', repeat=3):
        """Play the word's audio.

        The ``prefer`` accent is played when it exists and has a sound URL.
        Otherwise every stored pronunciation that has a sound URL is played.

        Args:
            prefer: Key into ``self.pronunciation`` to try first.
            repeat: Loop count forwarded to the player.
        """
        preferred = self.pronunciation.get(prefer)
        if preferred is not None and preferred.sound_url:
            self.play(preferred.sound_url, repeat)
            return
        for accent in self.pronunciation:
            sound_url = self.pronunciation[accent].sound_url
            # Truthiness also skips a None URL, which must never reach mpg123.
            if sound_url:
                self.play(sound_url, repeat)

    def play(self, url, repeat):
        """Play ``url`` with the external ``mpg123`` program, ``repeat`` loops.

        The player's stderr is discarded.  If ``mpg123`` cannot be started,
        a short message is written to stderr instead of raising.
        """
        # The null device must be opened for writing to serve as stderr, and
        # the context manager closes the handle once the player exits.
        with open(os.devnull, 'w') as devnull:
            try:
                subprocess.call(['mpg123', '--loop', str(repeat), '-q', url],
                                stderr=devnull)
            except OSError as err:
                sys.stderr.write('Cannot run mpg123: %s\n' % err)

    def display(self):
        """Print the pronunciations, then the definitions with examples."""
        puts('%s\t[%s]' % (colored.blue('Pronunciation:'), colored.red(self.spell)))
        for k in self.pronunciation:
            with indent(2):
                puts((u'%s %s\t[%s]\n %s' % (colored.magenta(u'◆'), k.capitalize(),
                                             colored.red(self.pronunciation[k].phonetic_symbol),
                                             colored.yellow(self.pronunciation[k].sound_url))).encode('utf-8'))
        puts()
        puts(colored.blue('Definition:'))
        for d in self.definition:
            with indent(2):
                puts((u'%s %s' % (colored.magenta(u'◆'),
                                  highlight_word(d.explaination, self.spell))).encode('utf-8'))
                puts()
                for e in d.examples:
                    # Examples are indented one level deeper than their sense.
                    with indent(2):
                        puts((u'%s\n%s' % (colored.yellow(e[0]),
                                           colored.green(e[1]))).encode('utf-8'))
                        puts()
def highlight_word(s, word):
    """Return ``s`` with occurrences of ``word`` coloured red.

    Both ``word`` as given and its ``capitalize()`` form are highlighted.
    Matching is done in a single pass, so an occurrence is wrapped only once
    (even when ``word`` is already capitalized) and the inserted colour codes
    are never re-scanned.

    Args:
        s: Text to search.
        word: Word to highlight; an empty word leaves ``s`` unchanged.

    Returns:
        The text with each match replaced by its red-coloured form.
    """
    if not word:
        # str.replace('') would insert colour codes between every character.
        return s
    variants = [word]
    cap_word = word.capitalize()
    if cap_word != word:
        variants.append(cap_word)
    pattern = '|'.join(re.escape(v) for v in variants)
    return re.sub(pattern, lambda m: str(colored.red(m.group(0))), s)
def strip_mid(s):
    """Collapse every run of whitespace in ``s`` into a single space.

    Leading and trailing runs are collapsed too, not removed; callers strip
    the result themselves.  ``re.UNICODE`` makes ``\\s`` cover Unicode
    whitespace (e.g. no-break space, ideographic space) in unicode text on
    Python 2 as well.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', s, flags=re.UNICODE)
def _remove_nodes(doc, xpath_expr):
    """Detach every element matched by ``xpath_expr`` from its parent."""
    # An explicit loop: map() is lazy on Python 3 and would remove nothing.
    for node in doc.xpath(xpath_expr):
        node.getparent().remove(node)


def _parse_pronunciation(node, sound_xpath):
    """Build a WordPronunciation from one pronunciation element.

    ``sound_xpath`` selects the ``onclick`` attribute whose first
    single-quoted argument is taken as the sound URL.  Fields that cannot be
    found keep their empty defaults; the two fields are read independently.
    """
    pron = WordPronunciation()
    strongs = node.xpath('span[@class="fl"]/strong')
    if len(strongs) > 1:
        pron.phonetic_symbol = strongs[1].text or ''
    onclick = node.xpath(sound_xpath)
    if onclick:
        pieces = onclick[0].split("'")
        if len(pieces) > 1:
            pron.sound_url = pieces[1]
    return pron


def _parse_definition(node):
    """Build a WordDefinition from one ``collins_en_cn`` element."""
    word_def = WordDefinition()
    caption = node.xpath('div[@class="caption"]')
    if caption:
        word_def.explaination = strip_mid(caption[0].text_content()).strip()
    for item in node.xpath('ul/li[not(@class)]'):
        paragraphs = [strip_mid(p.text_content()).strip() for p in item.xpath('p')]
        if len(paragraphs) < 2:
            # Skip a malformed example instead of aborting the whole parse.
            continue
        word_def.examples.append((paragraphs[0], paragraphs[1]))
    return word_def


def parseWord(word_spell):
    """Fetch ``word_spell`` from www.iciba.com and parse it into a Word.

    Args:
        word_spell: The word to look up.

    Returns:
        A Word.  Parsing is best-effort: when the page has no headword, or
        parsing fails part-way, a message is printed and the Word is returned
        with whatever was filled in so far.

    Raises:
        Errors from fetching the page are not caught here.
    """
    url = "http://www.iciba.com/%s" % urllib.quote_plus(word_spell)
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    doc = lxml.html.fromstring(html)
    word = Word()
    try:
        names = doc.xpath('//h1[@id="word_name_h1"]/text()')
        if not names:
            print('No dictionary entry found for "%s".' % word_spell)
            return word
        word.spell = names[0]

        pronuns = doc.xpath('//div[@id="dict_main"]/div[@class="dictbar"]//span[@class="eg"]')
        if pronuns:
            word.pronunciation['british'] = _parse_pronunciation(
                pronuns[0], 'a[@class="ico_sound"]/@onclick')
        if len(pronuns) > 1:
            word.pronunciation['american'] = _parse_pronunciation(
                pronuns[1], 'div[@class="vCri"]/a[@class="vCri_laba"]/@onclick')

        # remove troublesome elements before reading text out of the entries
        for css_path in ('span[@class="num"]',
                         'div[@class="tips_main"]',
                         'span[@class="tips_box"]'):
            _remove_nodes(doc, '//div[@class="collins_en_cn"]//' + css_path)

        # start parsing collins
        for node in doc.xpath('//div[@class="collins_en_cn"]'):
            word.definition.append(_parse_definition(node))
    except Exception as e:
        # Best-effort boundary: report the problem and return what we have.
        print(str(e))
    return word
def usage():
    """Print the command-line help text to standard output."""
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Command line English-Chinese Dictionary. Data comes from http://www.iciba.com/.")
    print("Usage: dict-cli.py [options] [word]")
    print(" -h, --help Print this information.")
    print(" -n, --nopronounce Don't pronounce [default do].")
    print(" -p, --prefer=b|a Prefer British or American pronunciation [default American].")
    print(" -r, --repeat=NUM Repeat times [default 3, 0 for infinite loop].")
# Accepted values of -p/--prefer mapped to the accent keys used for lookup.
_PREFER_ALIASES = {
    'a': 'american',
    'american': 'american',
    'b': 'british',
    'british': 'british',
}


def parse_options(argv):
    """Parse command-line arguments (without the program name).

    Args:
        argv: List of argument strings, e.g. ``sys.argv[1:]``.

    Returns:
        ``(options, args)`` where ``options`` is a dict with the keys
        ``help`` (bool), ``pronounce`` (bool), ``prefer`` ('american' or
        'british') and ``repeat`` (int >= 0), and ``args`` is the list of
        remaining positional arguments.

    Raises:
        getopt.GetoptError: On an unknown option or a missing option value.
        ValueError: On an invalid ``--prefer`` or ``--repeat`` value.
    """
    opts, args = getopt.getopt(argv, 'hnp:r:',
                               ['help', 'nopronounce', 'prefer=', 'repeat='])
    options = {'help': False, 'pronounce': True, 'prefer': 'american', 'repeat': 3}
    for flag, value in opts:
        if flag in ('-h', '--help'):
            # Help wins immediately; later options are not examined.
            options['help'] = True
            return options, args
        elif flag in ('-n', '--nopronounce'):
            options['pronounce'] = False
        elif flag in ('-p', '--prefer'):
            if value not in _PREFER_ALIASES:
                raise ValueError('invalid value for %s: %s' % (flag, value))
            options['prefer'] = _PREFER_ALIASES[value]
        elif flag in ('-r', '--repeat'):
            try:
                repeat = int(value)
            except ValueError:
                raise ValueError('invalid value for %s: %s' % (flag, value))
            if repeat < 0:
                raise ValueError('%s must not be negative: %s' % (flag, value))
            options['repeat'] = repeat
    return options, args


def main(argv=None):
    """Run the dictionary command line and return the process exit status.

    Returns 0 on success or after printing help, and 2 when the arguments
    are invalid or no word was given.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        options, args = parse_options(argv)
    except (getopt.GetoptError, ValueError) as err:
        print(str(err))
        usage()
        return 2
    if options['help']:
        usage()
        return 0
    if not args:
        usage()
        return 2
    word = parseWord(args[0])
    word.display()
    if options['pronounce']:
        word.pronounce(options['prefer'], options['repeat'])
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment