Skip to content

Instantly share code, notes, and snippets.

@bartvm
Created February 27, 2017 18:46
Show Gist options
  • Save bartvm/f9f50af868dd51866d7e2c8d29de5c7a to your computer and use it in GitHub Desktop.
Save bartvm/f9f50af868dd51866d7e2c8d29de5c7a to your computer and use it in GitHub Desktop.
import re
import sys
from bitarray import bitarray
from daryq import daryq
from collections import Counter
with open(sys.argv[1]) as f:
text = f.read()
# Construct dictionary
tokens = Counter(c for c in text if c != '\n')
# Construct heap queue of pairs
pairs = Counter(text[i:i + 2] for i in range(len(text) - 1)
if '\n' not in text[i:i + 1])
pairs = daryq(pairs.items())
# Keep track of merges
merged = bitarray(len(text))
merged.setall(0)
# Five points define the split |a|b|c|d| which is merged into |a|bc|d|
def merge(i, j, k, l, m):
tokens[text[j:k]] -= 1
tokens[text[k:l]] -= 1
tokens[text[j:l]] += 1
if j > 0:
pairs[text[i:k]] -= 1
pairs[text[i:l]] += 1
if l < len(text):
pairs[text[k:m]] -= 1
pairs[text[j:m]] += 1
# Start the algorithm
while len(tokens) < 100:
# Select a pair to merge and find all instances
to_merge = pairs.pop()
expression = re.compile(to_merge)
for j in expression.finditer(text):
l = j + len(to_merge)
if (j == 0 or merged[j - 1] == 0) and merged[l - 1] == 0:
# We have found an instance of pair starting at i
assert merged[j:l].count(False) == 2
k = merged.index(False, j, l - 1) + 1
m = merged.index(False, l) + 1 if l < len(text) else -1
i = next(_ for _ in range(j - 2, 0, -1)
if not merged[_]) + 1 if j > 0 else -1
# Perform the merge
merge(i, j, k, l, m)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment