dmarx · September 9, 2012 15:51
diff --git a/IndexAndCompress.py b/IndexAndCompress.py
 # Paths of the raw text to compress and of the word-index file.
 _text, _index = 'raw.txt', 'index.txt'

 def readFile(F):
    '''Return the lines of file F as a list of strings.

    Each line has its leading and trailing whitespace (including the
    newline) stripped; blank lines are kept as empty strings.
    '''
    with open(F, 'r') as handle:
        return [raw.strip() for raw in handle]

 def update_index(ind, F, txt):
    '''Add words from txt that the index lacks, in memory and in the file.

    ind -- list of known words; extended in place and also returned.
    F   -- path of the index file; new words are appended one per line.
           The file is only opened when there is something to add.
    txt -- iterable of text rows; each row is split on whitespace.

    Returns ind, with new words appended in order of first appearance.
    '''
    # Compare against the index that was passed in, not a module-level
    # global, so the function works with any caller-supplied list.
    known = set(ind)
    fresh = []
    for row in txt:
        for word in row.split():
            if word not in known:
                known.add(word)
                fresh.append(word)
    if fresh:
        # Text-mode append: the words are str, and the index is read back
        # in text mode as well.
        with open(F, 'a') as f:
            f.writelines(word + '\n' for word in fresh)
        ind.extend(fresh)
    return ind
                
 index = readFile(_index)
 text = readFile(_text)
 index = update_index(index, _index, text)

 for row in text:
    tokens = row.split()
    compressed = [chr(i+200) for word in tokens for i,w in enumerate(index) if w == word]
    print compressed
	# Paths of the raw text to compress and of the word-index file.
	_text, _index = 'raw.txt', 'index.txt'

	def readFile(F):
	    '''Return the lines of file F as a list of strings.

	    Each line has its leading and trailing whitespace (including the
	    newline) stripped; blank lines are kept as empty strings.
	    '''
	    with open(F, 'r') as f:
	        return [line.strip() for line in f]

	def update_index(ind, F, txt):
	    '''Add words from txt that the index lacks, in memory and in the file.

	    ind -- list of known words; extended in place and also returned.
	    F   -- path of the index file; new words are appended one per line.
	           The file is only opened when there is something to add.
	    txt -- iterable of text rows; each row is split on whitespace.

	    Returns ind, with new words appended in order of first appearance.
	    '''
	    # Compare against the index that was passed in, not a module-level
	    # global, so the function works with any caller-supplied list.
	    known = set(ind)
	    fresh = []
	    for row in txt:
	        for word in row.split():
	            if word not in known:
	                known.add(word)
	                fresh.append(word)
	    if fresh:
	        # Text-mode append: the words are str, and the index is read
	        # back in text mode as well.
	        with open(F, 'a') as f:
	            f.writelines(word + '\n' for word in fresh)
	        ind.extend(fresh)
	    return ind

	index = readFile(_index)
	text = readFile(_text)
	index = update_index(index, _index, text)

	for row in text:
	tokens = row.split()
	compressed = [chr(i+200) for word in tokens for i,w in enumerate(index) if w == word]
	print compressed