Created
June 12, 2011 14:32
-
-
Save PeterDing/1021610 to your computer and use it in GitHub Desktop.
Rearranges the records saved by StarDict into readable lines.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#########################################################
# I wrote this little script for use with the following dictionaries:
# stardict-cced-2.4.2.tar.bz2, stardict-langdao-ec-gb-2.4.2.tar.bz2,
# stardict-ncce-ec-2.4.2.tar.bz2, stardict-xiangya-medical-2.4.2.tar.bz2
#
# If you want to use other dictionaries, you will need to add or remove parts of the code.
#
# By default it is designed to remove the records of the <Collins Cobuild English Dictionary>,
# for Chinese users.
########################################################
import string | |
class cnvt(object):
    """Rearrange a StarDict lookup-record file into one readable line per word.

    The record text is expected to consist of entries separated by a blank
    line.  Within an entry the first line is the looked-up word, a line
    containing ``<---`` names a dictionary, and the lines after it are that
    dictionary's definition.

    Attributes:
        source: full text of the record file; edited in place by
            ``select`` and ``select2``.
        dict: maps each word to ``{dictionary header line: definition}``,
            filled in by ``make_dict``.
    """

    COLLINS_HEADER = '<--- Collins Cobuild English Dictionary --->'

    def __init__(self, filename):
        """Read the whole record file.

        Raises:
            IOError: if ``filename`` cannot be opened or read.
        """
        # ``with`` guarantees the handle is closed even if read() fails.
        with open(filename) as record_file:
            self.source = record_file.read()
        self.dict = {}

    def select(self):
        """Drop unwanted Collins Cobuild sections from ``self.source``.

        A section runs from the Collins header up to the next ``<---``
        header (or to the end of the text when there is none).  It is
        removed when it contains no blank line, when the text after its
        last blank line starts with a digit, or when it contains ``=>``.
        """
        pieces = []
        pos = 0
        while True:
            start = self.source.find(self.COLLINS_HEADER, pos)
            if start == -1:
                break
            # Skip past this header's own '<---' when looking for the next.
            end = self.source.find('<---', start + 4)
            if end == -1:
                end = len(self.source)
            section = self.source[start:end]
            pieces.append(self.source[pos:start])
            if not self._is_unwanted_collins(section):
                pieces.append(section)
            pos = end
        pieces.append(self.source[pos:])
        # Rebuilding from slices removes exactly the inspected span and
        # never invalidates the search position, unlike str.replace().
        self.source = ''.join(pieces)

    @staticmethod
    def _is_unwanted_collins(section):
        """Return True when a Collins section matches the removal rule."""
        if '\n\n' not in section:
            return True
        tail = section.split('\n\n')[-1]
        # tail[:1] is '' (not an IndexError) when the section ends on a
        # blank line.
        return tail[:1].isdigit() or '=>' in section

    def select2(self):
        """Remove every span from a ``*`` through the end of its line.

        The terminating newline is removed as well; a ``*`` on the last,
        unterminated line removes everything up to the end of the text.
        """
        pieces = []
        pos = 0
        while True:
            star = self.source.find('*', pos)
            if star == -1:
                break
            pieces.append(self.source[pos:star])
            line_end = self.source.find('\n', star + 1)
            if line_end == -1:
                pos = len(self.source)
                break
            pos = line_end + 1
        pieces.append(self.source[pos:])
        self.source = ''.join(pieces)

    def make_dict(self, out_path='out'):
        """Parse ``self.source`` and write the longest definition per word.

        Each word is written as ``word<TAB><TAB>---- definition.`` followed,
        after a separator line, by the list of original words.

        Args:
            out_path: file to write; defaults to ``out`` in the current
                directory.
        """
        entries = self.source.split('\n\n')
        if entries[-1] == '':
            del entries[-1]
        print('There are %d words!' % len(entries))

        words_list = []
        for entry in entries:
            lines = entry.split('\n')
            if lines[-1] == '':
                del lines[-1]
            if not lines:
                # Blank entry (three or more consecutive newlines).
                continue
            head = lines[0]  # the looked-up word
            words_list.append(head)
            if '<---' not in entry:
                print('Error! ---> %s' % head)
            self.dict[head] = self._collect_definitions(head, lines[1:])

        with open(out_path, 'w') as dict_file:
            for word in words_list:
                record = self._longest_record(word)
                # record[1:] drops the leading ',' added while joining.
                dict_file.write(
                    word + '\t\t' + '---- ' + record[1:] + '.' + '\n')
            dict_file.write('\n\n\n\\\\ original words:\n')
            for word in words_list:
                dict_file.write(word + '\n')

    def _collect_definitions(self, head, lines):
        """Group an entry's lines under the dictionary header above them."""
        definitions = {}
        dict_name = ''
        for item in lines:
            if '<---' in item:
                dict_name = item
                definitions[item] = ''
                continue
            # NOTE(review): this tests the definition line itself, not the
            # current dictionary header -- kept as in the original; confirm
            # whether ``dict_name`` was intended.
            if 'Collins Cobuild' not in item and self._is_redundant(item):
                continue
            if dict_name not in definitions:
                # Definition text appeared before any dictionary header.
                print('Error! ---> %s' % head)
                continue
            definitions[dict_name] += ',' + item
        return definitions

    @staticmethod
    def _is_redundant(item):
        """Return True when a line holds only characters in '0'..'z'.

        Spaces and hyphens are ignored.  The final character is not
        inspected, as in the original loop -- presumably to tolerate a
        trailing punctuation mark; TODO confirm.
        """
        compact = item.replace(' ', '').replace('-', '')
        return all(48 <= ord(char) <= 122 for char in compact[:-1])

    def _longest_record(self, word):
        """Return the longest definition stored for ``word``."""
        # Keyed by length so that, on a tie, the definition seen last wins.
        by_length = dict(
            (len(text), text) for text in self.dict[word].values())
        if not by_length:
            print('<@_@>!!!' + ' --->\t' + word + ' ==> '
                  + str(self.dict[word]))
            return ' <@_@>!!! '
        return by_length[max(by_length)]
if __name__ == '__main__':
    # Usage: script.py [record_file]
    # Without an argument the record is read from ~/dic.txt.  The result is
    # written to the file 'out' in the current working directory.
    import os
    import sys

    if len(sys.argv) == 1:
        # expanduser avoids spawning `printenv HOME` just to find the home.
        record_path = os.path.expanduser('~') + '/dic.txt'
    else:
        record_path = sys.argv[1]

    try:
        do = cnvt(record_path)
    except IOError:
        print("Warning! Indicating the right address of stardict record's file)")
        # Stop here: without a loaded record there is nothing to convert
        # (the original fell through and crashed with a NameError).
        sys.exit(1)

    do.select2()
    do.make_dict()
    print('Outputing file is at %s\n' % os.getcwd())

    # Wait for Enter so a console window stays open; raw_input only exists
    # on Python 2, input is its Python 3 equivalent.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment