Created
September 10, 2018 16:40
-
-
Save kemingy/c4a41054a8e5a2afdfc7effb1a2e1c9d to your computer and use it in GitHub Desktop.
decode sogou phrase
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import struct | |
| from collections import namedtuple | |
# One dictionary entry: the phrase text, its concatenated pinyin spelling and
# a frequency value.
Phrase = namedtuple('Phrase', ['word', 'pinyin', 'freq'])

# Byte offsets into the dictionary file at which the pinyin table and the
# phrase ("han") table begin; see Sogou.extract_data.
OFFSET_PINYIN = 0x1540
OFFSET_HAN = 0x2628


class Sogou:
    """Decoder for a Sogou phrase dictionary file.

    After ``extract_data()`` has run successfully:

    * ``pinyin`` maps a pinyin index to its syllable string,
    * ``words`` is a list of ``Phrase`` tuples in file order,
    * ``single_words`` maps each individual character seen in a phrase to
      a running average of the frequencies of the phrases containing it.

    All 16-bit values in the file are read as little-endian unsigned
    integers.
    """

    def __init__(self, file, single=False, scale=False):
        """Remember the dictionary path and create empty result containers.

        Args:
            file: Path of the dictionary file to decode.
            single: Accepted for interface compatibility; not used by any
                code in this class.
            scale: Accepted for interface compatibility; not used by any
                code in this class.

        Raises:
            NameError: If ``file`` is not an existing regular file. (The
                exception type is kept as-is so existing callers that
                catch it keep working.)
        """
        if not os.path.isfile(file):
            raise NameError("wrong file path")
        self.file = file
        self.pinyin = {}
        self.words = []
        # get_han() accumulates per-character frequencies here, so the
        # mapping has to exist before any parsing starts.
        self.single_words = {}

    def extract_data(self):
        """Read the file, print its metadata and fill the lookup tables.

        If the first 12 bytes do not match the expected signature, a
        message is printed and nothing is parsed.
        """
        with open(self.file, 'rb') as f:
            data = f.read()

        if data[:12] != b'\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00':
            print('Wrong format data.')
            return

        # Fixed-position metadata fields; the printed labels mean
        # "dictionary name", "dictionary type", "description" and
        # "dictionary examples" respectively.
        print('词库名称:\t', self.byte2str(data[0x130:0x338]))
        print('词库类型:\t', self.byte2str(data[0x338:0x540]))
        print('描述信息:\t', self.byte2str(data[0x540:0xd40]))
        print('词库示例:\t', self.byte2str(data[0xd40:OFFSET_PINYIN]))

        self.get_pinyin(data[OFFSET_PINYIN:OFFSET_HAN])
        self.get_han(data[OFFSET_HAN:])

    def save(self, path):
        """Write every phrase to ``path`` as ``word<TAB>pinyin<TAB>freq``.

        The output file is UTF-8 encoded, one phrase per line.
        """
        with open(path, 'w', encoding='utf-8') as f:
            for word in self.words:
                f.write('{}\t{}\t{}\n'.format(word.word, word.pinyin, word.freq))

    def byte2str(self, text):
        """Decode ``text`` as a sequence of 16-bit code units into a str.

        Zero code units (padding) are skipped; a trailing odd byte is
        ignored.
        """
        chars = []
        for pos in range(0, len(text) - 1, 2):
            num = self.unpack(text[pos:pos + 2])
            if num:
                chars.append(chr(num))
        return ''.join(chars)

    def get_pinyin(self, text):
        """Parse the pinyin table in ``text`` into ``self.pinyin``.

        The table must start with the 4-byte marker ``9D 01 00 00``;
        otherwise nothing is parsed. Each entry is a 16-bit index, a
        16-bit byte length, then that many bytes of syllable text.
        """
        if text[:4] != b'\x9D\x01\x00\x00':
            return None
        text = text[4:]
        pos = 0
        # Require a complete 4-byte entry header: a truncated tail would
        # otherwise be read as zeros and clobber an already parsed entry.
        while pos + 4 <= len(text):
            index = self.unpack(text[pos:pos + 2])
            length = self.unpack(text[pos + 2:pos + 4])
            pos += 4
            self.pinyin[index] = self.byte2str(text[pos:pos + length])
            pos += length

    def get_word(self, text):
        """Translate a run of 16-bit pinyin indexes into one pinyin string.

        Indexes missing from ``self.pinyin`` are reported on stdout and
        skipped.
        """
        parts = []
        for pos in range(0, len(text) - 1, 2):
            index = self.unpack(text[pos:pos + 2])
            try:
                parts.append(self.pinyin[index])
            except KeyError as err:
                print('Get an error: {}\nIgnore this and continue...'.format(err))
        return ''.join(parts)

    def get_han(self, text):
        """Parse the phrase table in ``text``.

        The table is a sequence of groups. Each group holds a count of
        phrases sharing one pronunciation, the byte length of the pinyin
        index list, the index list itself, and then for every phrase:
        its byte length, its text, the byte length of an extension area
        and the extension area. Non-empty phrases are appended to
        ``self.words`` and their characters folded into
        ``self.single_words``.
        """
        pos = 0
        while pos < len(text):
            same = self.unpack(text[pos:pos + 2])
            pos += 2
            pinyin_len = self.unpack(text[pos:pos + 2])
            pos += 2
            pinyin = self.get_word(text[pos:pos + pinyin_len])
            pos += pinyin_len

            for _ in range(same):
                han_len = self.unpack(text[pos:pos + 2])
                pos += 2
                word = self.byte2str(text[pos:pos + han_len])
                pos += han_len
                ext_len = self.unpack(text[pos:pos + 2])
                pos += 2
                # The first 16 bits of the extension area are taken as the
                # frequency and divided by 3 (yielding a float).
                # NOTE(review): the rationale for the factor 3 is not
                # documented; the stored frequencies are said to be
                # unreliable.
                freq = self.unpack(text[pos:pos + 2]) / 3
                pos += ext_len

                if word:
                    self.words.append(Phrase(word, pinyin, freq))
                    for w in word:
                        if w in self.single_words:
                            # Running average of the previous value and
                            # the new frequency.
                            self.single_words[w] = (self.single_words[w] + freq) / 2
                        else:
                            self.single_words[w] = freq

    def unpack(self, text):
        """Return ``text`` as a little-endian unsigned 16-bit integer.

        Any input that is not exactly two bytes long (for example a slice
        that ran past the end of the data) yields 0.
        """
        if len(text) != 2:
            return 0
        # Explicit '<' so the result does not depend on the host byte order.
        return struct.unpack('<H', text)[0]
if __name__ == '__main__':
    # Demo run: decode a sample dictionary and show its first five phrases.
    decoder = Sogou('Downloads/网络流行新词【官方推荐】.scel')
    decoder.extract_data()
    preview = decoder.words[:5]
    for phrase in preview:
        print(phrase)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment