Skip to content

Instantly share code, notes, and snippets.

@kemingy
Created September 10, 2018 16:40
Show Gist options
  • Select an option

  • Save kemingy/c4a41054a8e5a2afdfc7effb1a2e1c9d to your computer and use it in GitHub Desktop.

Select an option

Save kemingy/c4a41054a8e5a2afdfc7effb1a2e1c9d to your computer and use it in GitHub Desktop.
Decode Sogou pinyin phrase dictionary (.scel) files
import os
import struct
from collections import namedtuple
# One decoded dictionary entry: the word text, its concatenated pinyin
# spelling, and the frequency value computed by Sogou.get_han.
Phrase = namedtuple('Phrase', ['word', 'pinyin', 'freq'])
# Byte offset in the file where the pinyin table starts; it is also where the
# metadata field printed as the sample words ends.
OFFSET_PINYIN = 0x1540
# Byte offset where the pinyin table ends and the word records start.
OFFSET_HAN = 0x2628
class Sogou:
    """Decoder for a Sogou ``.scel`` phrase dictionary file.

    Typical use is ``Sogou(path).extract_data()``, after which:

    * ``pinyin`` maps the 16-bit syllable index to its pinyin string,
    * ``words`` is a list of ``Phrase(word, pinyin, freq)`` records,
    * ``single_words`` maps every character seen in a word to a running
      pairwise average of the frequencies of the words containing it.

    All 16-bit integers in the file are read as little-endian, matching the
    magic header checked in :meth:`extract_data`.
    """

    def __init__(self, file, single=False, scale=False):
        """Remember the dictionary path and create empty result containers.

        Args:
            file: Path of the ``.scel`` file to decode.
            single: Accepted for compatibility; not used by this class.
            scale: Accepted for compatibility; not used by this class.

        Raises:
            NameError: If ``file`` is not an existing regular file.
        """
        if not os.path.isfile(file):
            raise NameError("wrong file path")
        self.file = file
        self.pinyin = {}
        self.words = []
        # get_han() accumulates into this dict, so it must exist up front.
        self.single_words = {}

    def extract_data(self):
        """Read the file, print its metadata and decode both tables.

        Prints ``Wrong format data.`` and returns without decoding when the
        first 12 bytes are not the expected magic header.
        """
        with open(self.file, 'rb') as f:
            data = f.read()

        if data[:12] != b'\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00':
            print('Wrong format data.')
            return

        # Fixed-position metadata fields; the Chinese labels printed here are
        # name, type, description and sample words.
        print('词库名称:\t', self.byte2str(data[0x130:0x338]))
        print('词库类型:\t', self.byte2str(data[0x338:0x540]))
        print('描述信息:\t', self.byte2str(data[0x540:0xd40]))
        print('词库示例:\t', self.byte2str(data[0xd40:OFFSET_PINYIN]))

        self.get_pinyin(data[OFFSET_PINYIN:OFFSET_HAN])
        self.get_han(data[OFFSET_HAN:])

    def save(self, path):
        """Write the decoded words to ``path`` as UTF-8 text.

        Each line is ``word<TAB>pinyin<TAB>freq``.
        """
        with open(path, 'w', encoding='utf-8') as f:
            for word in self.words:
                f.write('{}\t{}\t{}\n'.format(word.word, word.pinyin, word.freq))

    def byte2str(self, text):
        """Decode little-endian 16-bit code units into a string.

        Zero code units (padding/terminators) are skipped and a trailing odd
        byte is ignored.
        """
        count = len(text) // 2
        units = struct.unpack('<%dH' % count, text[:count * 2])
        return ''.join(chr(unit) for unit in units if unit)

    def get_pinyin(self, text):
        """Fill ``self.pinyin`` from the raw pinyin table.

        The table starts with the 4-byte marker ``9D 01 00 00``; each entry
        is ``index (u16)``, ``byte length (u16)`` and then the syllable text.
        Returns ``None`` in every case; nothing is decoded when the marker
        is missing.
        """
        if text[:4] != b'\x9D\x01\x00\x00':
            return None
        text = text[4:]
        pos = 0
        # Require a complete 4-byte entry header: a short tail would decode
        # as index 0 / length 0 and overwrite the real entry 0 with ''.
        while pos + 4 <= len(text):
            index = self.unpack(text[pos:pos + 2])
            length = self.unpack(text[pos + 2:pos + 4])
            pos += 4
            self.pinyin[index] = self.byte2str(text[pos:pos + length])
            pos += length

    def get_word(self, text):
        """Turn a sequence of 16-bit pinyin indexes into one pinyin string.

        Syllables are concatenated without a separator. Indexes missing from
        ``self.pinyin`` are reported on stdout and skipped.
        """
        syllables = []
        for pos in range(0, len(text) - 1, 2):
            index = self.unpack(text[pos:pos + 2])
            try:
                syllables.append(self.pinyin[index])
            except KeyError as err:
                print('Get an error: {}\nIgnore this and continue...'.format(err))
        return ''.join(syllables)

    def get_han(self, text):
        """Decode the word records and fill ``words`` and ``single_words``.

        Each group is: number of words sharing one pinyin (u16), byte length
        of the pinyin index list (u16), the index list, then for every word:
        byte length (u16), the word text, extension length (u16) and the
        extension bytes, whose first u16 is read as the raw frequency.
        """
        pos = 0
        while pos < len(text):
            same = self.unpack(text[pos:pos + 2])
            pos += 2
            pinyin_len = self.unpack(text[pos:pos + 2])
            pos += 2
            pinyin = self.get_word(text[pos:pos + pinyin_len])
            pos += pinyin_len

            for _ in range(same):
                if pos >= len(text):
                    # Truncated data: nothing further can be decoded.
                    break
                han_len = self.unpack(text[pos:pos + 2])
                pos += 2
                word = self.byte2str(text[pos:pos + han_len])
                pos += han_len
                ext_len = self.unpack(text[pos:pos + 2])
                pos += 2
                # The stored value is divided by 3, as in the original
                # script; the reason for the factor is not documented here.
                freq = self.unpack(text[pos:pos + 2]) / 3
                pos += ext_len

                if not word:
                    continue
                self.words.append(Phrase(word, pinyin, freq))
                for w in word:
                    if w in self.single_words:
                        self.single_words[w] = (self.single_words[w] + freq) / 2
                    else:
                        self.single_words[w] = freq

    def unpack(self, text):
        """Return the little-endian u16 stored in ``text``.

        Returns 0 when ``text`` is not exactly two bytes long, which is what
        slices running past the end of the data produce.
        """
        if len(text) != 2:
            return 0
        return struct.unpack('<H', text)[0]
if __name__ == '__main__':
    # Demo: decode a sample dictionary (path is relative to the current
    # working directory) and show the first five decoded entries.
    sogou = Sogou('Downloads/网络流行新词【官方推荐】.scel')
    sogou.extract_data()
    for w in sogou.words[:5]:
        print(w)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment