Created
December 4, 2018 01:20
-
-
Save ven-kyoshiro/9980ef8c4f949a0b43d8e1bc0a93c59b to your computer and use it in GitHub Desktop.
tunstall.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import os | |
import pandas as pd | |
import collections | |
import pickle | |
def notify(message='done'):
    """Accept a status message and do nothing with it.

    This is a no-op stub: ``message`` is ignored and ``None`` is returned.
    NOTE(review): presumably a hook for sending a notification somewhere;
    no delivery code is present in this file -- confirm before relying on it.
    """
    return None
def tunstall_code(seq, n):
    """Build a Tunstall-style phrase dictionary from substring counts in ``seq``.

    Starting from the empty phrase, the candidate with the highest count is
    repeatedly extended by every symbol of the alphabet; each extension that
    actually occurs in ``seq`` becomes a new candidate.  The character ``'-'``
    is treated as a separator: it is left out of the alphabet, so no phrase
    ever contains it.

    Growth stops as soon as the number of entries (the empty root phrase
    included) exceeds ``2 ** n``, or when there is no candidate left to
    expand.  The result can therefore hold ``2 ** n`` entries or a few more,
    and fewer when ``seq`` is too short to supply that many phrases.

    Args:
        seq: Text to analyse.
        n: Code width in bits; the target dictionary size is ``2 ** n``.

    Returns:
        A list of ``[phrase_length, count, phrase]`` lists sorted in
        descending order (longest phrase first, then by count, then by
        phrase).  ``count`` is ``seq.count(phrase)``.  Expanded (internal)
        phrases are included; the empty root phrase is not, unless no
        expansion happened at all.
    """
    dict_size = 2 ** n
    symbol_total = len(seq) - seq.count('-')
    alphabet = set(seq) - {'-'}

    leaves = [[symbol_total, '']]   # candidates still eligible for expansion
    expanded = []                   # candidates already expanded, root first

    # Stopping on an empty ``leaves`` list avoids the IndexError that used to
    # occur when the input ran out of phrases before reaching the size target.
    while leaves and len(leaves) + len(expanded) <= dict_size:
        # Highest count wins; ties are broken by the larger phrase string,
        # which is the same choice a descending sort would put first.
        best = max(leaves)
        leaves.remove(best)
        prefix = best[1]
        for symbol in alphabet:
            phrase = prefix + symbol
            count = seq.count(phrase)
            if count:
                leaves.append([count, phrase])
        expanded.append(best)

    # expanded[0] is the empty root phrase, which is not a usable entry.
    entries = leaves + expanded[1:]
    return sorted(
        ([len(phrase), count, phrase] for count, phrase in entries),
        reverse=True,
    )
def main():
    """Encode the dataset with dictionaries of several sizes and record results.

    Reads ``dataset.csv``, joins the values of its ``array`` column with
    ``'-'``, and for each code width ``j`` from 5 to 18 builds a codebook with
    ``tunstall_code``, replaces every codebook phrase in the joined text by a
    single character, and records the width, the encoded length and the
    codebook.  The record is pickled to ``vf_record.pickle``.
    """
    df = pd.read_csv('dataset.csv', index_col=0)
    # Bracket access: attribute-style access can be shadowed by DataFrame
    # attributes of the same name.
    sequences = df['array'].values.tolist()
    joined = '-'.join(sequences)

    record = {'j': [], 'enc': [], 'codebook': []}
    for j in range(5, 19):
        codebook = tunstall_code(joined, j)

        # Replace phrases in codebook order (longest first), giving each one
        # its own character from code point 12354 (U+3042) upwards.
        seqx = joined
        for i, entry in enumerate(codebook):
            seqx = seqx.replace(entry[2], chr(12354 + i))

        record['j'].append(j)
        record['enc'].append(len(seqx))
        record['codebook'].append(codebook)
        notify(str(record['j'])+' is done. \n score:'+str(len(seqx)))

        # Rewritten after every width so a failure at a later width still
        # leaves the earlier results on disk.
        with open('vf_record.pickle', mode='wb') as f:
            pickle.dump(record, f)
if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        # Report the failure, then re-raise so the traceback and a non-zero
        # exit status are not lost (notify alone would swallow the error).
        notify('!!!! Error !!!!\n'+str(e)+'\n')
        raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment