ordonezf · April 4, 2016 23:41
diff --git a/decoder.py b/decoder.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import sys
 import codecs
 # Number of bytes get_dic_from_file reads per chunk while looking for the
 # two-NUL terminator that closes the dictionary header.
 READ_BYTES = 200

 def get_dic_from_file(f_name):
 	"""Read the word dictionary stored at the start of an encoded file.

 	The header is the dictionary words joined by single NUL bytes and closed
 	by two consecutive NUL bytes. Words are paired, in order, with the
 	reserved code bytes 1-8, 11, 12, 14-31 and 127-255; surplus words are
 	ignored.

 	Args:
 		f_name: name of the encoded file inside the ``files/`` directory.

 	Returns:
 		A ``(dic_codes, offset)`` tuple. ``dic_codes`` maps each code (int)
 		to its word (byte string); ``offset`` is the position of the first
 		payload byte, i.e. just past the two-NUL terminator.

 	Raises:
 		ValueError: if the file ends before the terminator is found.
 	"""
 	# list(...) keeps the concatenation valid on Python 3 as well as 2.
 	codes = (list(range(1, 9)) + [11, 12] + list(range(14, 32)) +
 			list(range(127, 256)))
 	limit = b"\x00\x00"
 	header = b""
 	# Binary mode: the header is raw bytes and the returned offset is later
 	# used with seek() on a file opened in binary mode.
 	with open("files/{name}".format(name=f_name), "rb") as f:
 		while limit not in header:
 			chunk = f.read(READ_BYTES)
 			if not chunk:
 				# End of file: without this check the loop never terminates.
 				raise ValueError(
 					"no dictionary terminator found in files/{name}".format(
 						name=f_name))
 			header += chunk
 	bytes_read = header.index(limit)
 	words = header[:bytes_read].split(b"\x00")
 	return dict(zip(codes, words)), bytes_read + 2

 def decode_file(f_name, dic_codes, bytes_read):
 	"""Expand an encoded file into ``files/decoded_<f_name>``.

 	The payload is processed strictly as bytes:
 	  * a NUL byte is an escape marker: the byte after it is copied verbatim;
 	  * a byte that is a key of ``dic_codes`` is replaced by its word;
 	  * any other byte is copied verbatim.
 	A lone escape marker at the very end of the file is dropped.

 	Copying bytes verbatim keeps multi-byte UTF-8 characters intact; turning
 	each escaped byte into its own code point and UTF-8 encoding it again
 	would double-encode them.

 	Args:
 		f_name: name of the encoded file inside the ``files/`` directory.
 		dic_codes: mapping of code byte value (int) to word. Words may be
 			byte strings or text; text is encoded as UTF-8.
 		bytes_read: offset of the first payload byte (size of the header).
 	"""
 	ascii_names = ["encoded_pg84.txt", "encoded_pg21279.txt"]
 	with open("files/{name}".format(name=f_name), "rb") as f:
 		f.seek(bytes_read)
 		payload = bytearray(f.read())
 	table = {}
 	for code, word in dic_codes.items():
 		table[code] = word if isinstance(word, bytes) else word.encode("utf-8")
 	decoded = bytearray()
 	escaped = False
 	for value in payload:
 		if escaped:
 			decoded.append(value)
 			escaped = False
 		elif value == 0:
 			escaped = True
 		elif value in table:
 			decoded.extend(table[value])
 		else:
 			decoded.append(value)
 	with open("files/decoded_{name}".format(name=f_name), "wb") as w:
 		# Files outside the ASCII list get a UTF-8 byte-order mark, as the
 		# "utf-8-sig" writer did; nothing is emitted for an empty payload.
 		if decoded and f_name not in ascii_names:
 			w.write(codecs.BOM_UTF8)
 		w.write(decoded)

 def start_decoder():
 	"""Decode each of the six hard-coded encoded files under ``files/``.

 	For every file the dictionary header is read first, then the payload is
 	expanded with that dictionary.
 	"""
 	encoded_files = (
 		"encoded_pg10.txt", "encoded_pg84.txt", "encoded_pg100.txt",
 		"encoded_pg1400.txt", "encoded_pg21279.txt", "encoded_pg22657.txt",
 	)
 	for encoded_name in encoded_files:
 		# Single parenthesised argument: prints the same text on Python 2 and 3.
 		print("Decoding file: {}...".format(encoded_name))
 		table, payload_offset = get_dic_from_file(encoded_name)
 		decode_file(encoded_name, table, payload_offset)
diff --git a/encoder.py b/encoder.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Word count that start_encoder passes to most_frecuent_words as `nwords`.
 # match_frecuent_words has only 157 code bytes to hand out, so words ranked
 # beyond the 157th never receive a code.
 TOP_WORDS = 158

 def most_frecuent_words(f_name, nwords):
 	"""Return the ``nwords`` most frequent whitespace-separated words of a file.

 	The file is read as bytes, so the words are byte strings (plain ``str``
 	on Python 2). Words are ordered by descending frequency.

 	Args:
 		f_name: name of the text file inside the ``files/`` directory.
 		nwords: maximum number of words to return; values below 1 give an
 			empty list.

 	Returns:
 		A list of at most ``nwords`` words, most frequent first.
 	"""
 	counts = {}
 	with open("files/{name}".format(name=f_name), "rb") as f:
 		for line in f:
 			for word in line.split():
 				counts[word] = counts.get(word, 0) + 1
 	ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
 	# Honour the caller's limit; the global TOP_WORDS is no longer consulted
 	# and exactly `nwords` (not `nwords + 1`) words are returned.
 	return [word for word, _ in ranked[:max(nwords, 0)]]

 def match_frecuent_words(top_words):
 	"""Assign one reserved code byte to each word, in order.

 	The available codes are 1-8, 11, 12, 14-31 and 127-255 (157 in total).
 	Words beyond the number of available codes are left out; if a word is
 	repeated, its last position determines the code.

 	Args:
 		top_words: iterable of words, highest priority first.

 	Returns:
 		A dict mapping each word to its code (int).
 	"""
 	# list(...) keeps the concatenation valid on Python 3 as well as 2.
 	codes = (list(range(1, 9)) + [11, 12] + list(range(14, 32)) +
 			list(range(127, 256)))
 	return dict(zip(top_words, codes))

 def encode_file(f_name, dic_codes):
 	"""Write ``files/encoded_<f_name>``: a dictionary header plus coded text.

 	Output layout:
 	  * header: the words of ``dic_codes`` ordered by code, joined by single
 	    NUL bytes and closed by two NUL bytes;
 	  * payload: the input split into runs of whitespace and non-whitespace.
 	    A non-whitespace run that is a key of ``dic_codes`` becomes its
 	    one-byte code. Every other byte is copied, preceded by a NUL escape
 	    when the byte is NUL itself or one of the reserved code values.

 	All whitespace (spaces, tabs, CR, LF, VT, FF) is copied as found, so the
 	original line endings survive without per-file special cases. For files
 	not in the ASCII list a leading UTF-8 byte-order mark is dropped. The
 	whole input is held in memory and is not validated as UTF-8.

 	Args:
 		f_name: name of the text file inside the ``files/`` directory.
 		dic_codes: mapping of word to code byte value (int). Words may be
 			byte strings or text; text is encoded as UTF-8. The header
 			stores no code numbers, so the codes should be the leading
 			entries of the reserved sequence, as match_frecuent_words
 			produces them.
 	"""
 	import re  # local import: the block must not rely on new module imports
 	ascii_names = ["pg84.txt", "pg21279.txt"]
 	codes = (list(range(1, 9)) + [11, 12] + list(range(14, 32)) +
 			list(range(127, 256)))
 	# A set gives constant-time lookups; NUL is included so that a literal
 	# NUL in the text cannot be mistaken for an escape marker.
 	must_escape = set(codes)
 	must_escape.add(0)
 	table = {}
 	for word, code in dic_codes.items():
 		table[word if isinstance(word, bytes) else word.encode("utf-8")] = code
 	with open("files/{name}".format(name=f_name), "rb") as f:
 		data = f.read()
 	bom = b"\xef\xbb\xbf"
 	if f_name not in ascii_names and data.startswith(bom):
 		data = data[len(bom):]
 	out = bytearray()
 	out.extend(b"\x00".join(sorted(table, key=table.get)))
 	out.extend(b"\x00\x00")
 	# The capturing group makes re.split keep the whitespace runs.
 	for piece in re.split(b"([ \t\n\r\x0b\x0c]+)", data):
 		if piece in table:
 			out.append(table[piece])
 			continue
 		for value in bytearray(piece):
 			if value in must_escape:
 				out.append(0)
 			out.append(value)
 	with open("files/encoded_{name}".format(name=f_name), "wb") as w:
 		w.write(out)

 def start_encoder():
 	"""Encode each of the six hard-coded text files under ``files/``.

 	For every file the most frequent words are collected, mapped to code
 	bytes, and the file is then written in encoded form.
 	"""
 	source_files = (
 		"pg10.txt", "pg84.txt", "pg100.txt", "pg1400.txt",
 		"pg21279.txt", "pg22657.txt",
 	)
 	for source_name in source_files:
 		# Single parenthesised argument: prints the same text on Python 2 and 3.
 		print("Encoding file: {}...".format(source_name))
 		frequent = most_frecuent_words(source_name, TOP_WORDS)
 		encode_file(source_name, match_frecuent_words(frequent))
diff --git a/finger5.py b/finger5.py
 from encoder import start_encoder
 from decoder import start_decoder
 def main():
 	"""Run the encoder over every input file, then the decoder."""
 	for stage in (start_encoder, start_decoder):
 		stage()
 # Called unconditionally, so this also runs when the module is imported.
 main()
	#!/usr/bin/env python
	# -- coding: utf-8 --
	import sys
	import codecs
	# Number of bytes get_dic_from_file reads per chunk while looking for the
	# two-NUL terminator that closes the dictionary header.
	READ_BYTES = 200

	def get_dic_from_file(f_name):
		"""Read the word dictionary stored at the start of an encoded file.

		The header is the dictionary words joined by single NUL bytes and closed
		by two consecutive NUL bytes. Words are paired, in order, with the
		reserved code bytes 1-8, 11, 12, 14-31 and 127-255; surplus words are
		ignored.

		Args:
			f_name: name of the encoded file inside the ``files/`` directory.

		Returns:
			A ``(dic_codes, offset)`` tuple. ``dic_codes`` maps each code (int)
			to its word (byte string); ``offset`` is the position of the first
			payload byte, i.e. just past the two-NUL terminator.

		Raises:
			ValueError: if the file ends before the terminator is found.
		"""
		# list(...) keeps the concatenation valid on Python 3 as well as 2.
		codes = (list(range(1, 9)) + [11, 12] + list(range(14, 32)) +
				list(range(127, 256)))
		limit = b"\x00\x00"
		header = b""
		# Binary mode: the header is raw bytes and the returned offset is later
		# used with seek() on a file opened in binary mode.
		with open("files/{name}".format(name=f_name), "rb") as f:
			while limit not in header:
				chunk = f.read(READ_BYTES)
				if not chunk:
					# End of file: without this check the loop never terminates.
					raise ValueError(
						"no dictionary terminator found in files/{name}".format(
							name=f_name))
				header += chunk
		bytes_read = header.index(limit)
		words = header[:bytes_read].split(b"\x00")
		return dict(zip(codes, words)), bytes_read + 2

	def decode_file(f_name, dic_codes, bytes_read):
		"""Expand an encoded file into ``files/decoded_<f_name>``.

		The payload is processed strictly as bytes:
		  * a NUL byte is an escape marker: the byte after it is copied verbatim;
		  * a byte that is a key of ``dic_codes`` is replaced by its word;
		  * any other byte is copied verbatim.
		A lone escape marker at the very end of the file is dropped.

		Copying bytes verbatim keeps multi-byte UTF-8 characters intact; turning
		each escaped byte into its own code point and UTF-8 encoding it again
		would double-encode them.

		Args:
			f_name: name of the encoded file inside the ``files/`` directory.
			dic_codes: mapping of code byte value (int) to word. Words may be
				byte strings or text; text is encoded as UTF-8.
			bytes_read: offset of the first payload byte (size of the header).
		"""
		ascii_names = ["encoded_pg84.txt", "encoded_pg21279.txt"]
		with open("files/{name}".format(name=f_name), "rb") as f:
			f.seek(bytes_read)
			payload = bytearray(f.read())
		table = {}
		for code, word in dic_codes.items():
			table[code] = word if isinstance(word, bytes) else word.encode("utf-8")
		decoded = bytearray()
		escaped = False
		for value in payload:
			if escaped:
				decoded.append(value)
				escaped = False
			elif value == 0:
				escaped = True
			elif value in table:
				decoded.extend(table[value])
			else:
				decoded.append(value)
		with open("files/decoded_{name}".format(name=f_name), "wb") as w:
			# Files outside the ASCII list get a UTF-8 byte-order mark, as the
			# "utf-8-sig" writer did; nothing is emitted for an empty payload.
			if decoded and f_name not in ascii_names:
				w.write(codecs.BOM_UTF8)
			w.write(decoded)

	def start_decoder():
		"""Decode each of the six hard-coded encoded files under ``files/``.

		For every file the dictionary header is read first, then the payload is
		expanded with that dictionary.
		"""
		encoded_files = (
			"encoded_pg10.txt", "encoded_pg84.txt", "encoded_pg100.txt",
			"encoded_pg1400.txt", "encoded_pg21279.txt", "encoded_pg22657.txt",
		)
		for encoded_name in encoded_files:
			# Single parenthesised argument: prints the same text on Python 2 and 3.
			print("Decoding file: {}...".format(encoded_name))
			table, payload_offset = get_dic_from_file(encoded_name)
			decode_file(encoded_name, table, payload_offset)
	#!/usr/bin/env python
	# -- coding: utf-8 --
	# Word count that start_encoder passes to most_frecuent_words as `nwords`.
	# match_frecuent_words has only 157 code bytes to hand out, so words ranked
	# beyond the 157th never receive a code.
	TOP_WORDS = 158

	def most_frecuent_words(f_name, nwords):
		"""Return the ``nwords`` most frequent whitespace-separated words of a file.

		The file is read as bytes, so the words are byte strings (plain ``str``
		on Python 2). Words are ordered by descending frequency.

		Args:
			f_name: name of the text file inside the ``files/`` directory.
			nwords: maximum number of words to return; values below 1 give an
				empty list.

		Returns:
			A list of at most ``nwords`` words, most frequent first.
		"""
		counts = {}
		with open("files/{name}".format(name=f_name), "rb") as f:
			for line in f:
				for word in line.split():
					counts[word] = counts.get(word, 0) + 1
		ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
		# Honour the caller's limit; the global TOP_WORDS is no longer consulted
		# and exactly `nwords` (not `nwords + 1`) words are returned.
		return [word for word, _ in ranked[:max(nwords, 0)]]

	def match_frecuent_words(top_words):
		"""Assign one reserved code byte to each word, in order.

		The available codes are 1-8, 11, 12, 14-31 and 127-255 (157 in total).
		Words beyond the number of available codes are left out; if a word is
		repeated, its last position determines the code.

		Args:
			top_words: iterable of words, highest priority first.

		Returns:
			A dict mapping each word to its code (int).
		"""
		# list(...) keeps the concatenation valid on Python 3 as well as 2.
		codes = (list(range(1, 9)) + [11, 12] + list(range(14, 32)) +
				list(range(127, 256)))
		return dict(zip(top_words, codes))

	def encode_file(f_name, dic_codes):
		"""Write ``files/encoded_<f_name>``: a dictionary header plus coded text.

		Output layout:
		  * header: the words of ``dic_codes`` ordered by code, joined by single
		    NUL bytes and closed by two NUL bytes;
		  * payload: the input split into runs of whitespace and non-whitespace.
		    A non-whitespace run that is a key of ``dic_codes`` becomes its
		    one-byte code. Every other byte is copied, preceded by a NUL escape
		    when the byte is NUL itself or one of the reserved code values.

		All whitespace (spaces, tabs, CR, LF, VT, FF) is copied as found, so the
		original line endings survive without per-file special cases. For files
		not in the ASCII list a leading UTF-8 byte-order mark is dropped. The
		whole input is held in memory and is not validated as UTF-8.

		Args:
			f_name: name of the text file inside the ``files/`` directory.
			dic_codes: mapping of word to code byte value (int). Words may be
				byte strings or text; text is encoded as UTF-8. The header
				stores no code numbers, so the codes should be the leading
				entries of the reserved sequence, as match_frecuent_words
				produces them.
		"""
		import re  # local import: the block must not rely on new module imports
		ascii_names = ["pg84.txt", "pg21279.txt"]
		codes = (list(range(1, 9)) + [11, 12] + list(range(14, 32)) +
				list(range(127, 256)))
		# A set gives constant-time lookups; NUL is included so that a literal
		# NUL in the text cannot be mistaken for an escape marker.
		must_escape = set(codes)
		must_escape.add(0)
		table = {}
		for word, code in dic_codes.items():
			table[word if isinstance(word, bytes) else word.encode("utf-8")] = code
		with open("files/{name}".format(name=f_name), "rb") as f:
			data = f.read()
		bom = b"\xef\xbb\xbf"
		if f_name not in ascii_names and data.startswith(bom):
			data = data[len(bom):]
		out = bytearray()
		out.extend(b"\x00".join(sorted(table, key=table.get)))
		out.extend(b"\x00\x00")
		# The capturing group makes re.split keep the whitespace runs.
		for piece in re.split(b"([ \t\n\r\x0b\x0c]+)", data):
			if piece in table:
				out.append(table[piece])
				continue
			for value in bytearray(piece):
				if value in must_escape:
					out.append(0)
				out.append(value)
		with open("files/encoded_{name}".format(name=f_name), "wb") as w:
			w.write(out)

	def start_encoder():
		"""Encode each of the six hard-coded text files under ``files/``.

		For every file the most frequent words are collected, mapped to code
		bytes, and the file is then written in encoded form.
		"""
		source_files = (
			"pg10.txt", "pg84.txt", "pg100.txt", "pg1400.txt",
			"pg21279.txt", "pg22657.txt",
		)
		for source_name in source_files:
			# Single parenthesised argument: prints the same text on Python 2 and 3.
			print("Encoding file: {}...".format(source_name))
			frequent = most_frecuent_words(source_name, TOP_WORDS)
			encode_file(source_name, match_frecuent_words(frequent))
	from encoder import start_encoder
	from decoder import start_decoder
	def main():
		"""Run the encoder over every input file, then the decoder."""
		for stage in (start_encoder, start_decoder):
			stage()
	# Called unconditionally, so this also runs when the module is imported.
	main()