Skip to content

Instantly share code, notes, and snippets.

@macleginn
Created November 15, 2024 10:33
Show Gist options
  • Save macleginn/9f0710102be41a9b2d208890e4fe2190 to your computer and use it in GitHub Desktop.
Save macleginn/9f0710102be41a9b2d208890e4fe2190 to your computer and use it in GitHub Desktop.
BPE without word separation
import urllib.request
from collections import defaultdict
# Download the corpus text (brown.txt) once at import time. The response is
# opened in a `with` block so the underlying connection is closed as soon as
# the payload has been read, instead of being left open.
with urllib.request.urlopen(
        'http://www.sls.hawaii.edu/bley-vroman/brown.txt') as raw_bytes:
    # Decode as UTF-8 and normalize Windows line endings to '\n'.
    brown_corpus = raw_bytes.read().decode('utf8').replace('\r\n', '\n')
# Small sample: the first 250 characters of the corpus.
B = brown_corpus[:250]
def find_pairs(text):
    """Count how often each pair of adjacent symbols occurs in *text*.

    Args:
        text: A sliceable sequence of symbols -- either a plain string
            (symbols are single characters) or a list of string tokens.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` tuples to the number
        of positions at which ``left`` is immediately followed by ``right``.
        Overlapping occurrences are counted separately, so ``"aaa"`` yields
        a count of 2 for ``('a', 'a')``. A sequence with fewer than two
        symbols yields an empty mapping.
    """
    pairs = defaultdict(int)
    # Pair each symbol with its successor by zipping the sequence against
    # itself shifted by one position.
    for left, right in zip(text, text[1:]):
        pairs[(left, right)] += 1
    return pairs
def merge(text, max_pair):
    """Fuse every non-overlapping occurrence of *max_pair* into one token.

    The sequence is scanned left to right. Wherever the current symbol and
    its successor equal ``max_pair``, the two are concatenated into a single
    token and the scan skips past both; every other symbol is copied through
    unchanged.

    Args:
        text: A sequence of symbols -- a string or a list of string tokens.
        max_pair: ``(left, right)`` tuple identifying the pair to merge.

    Returns:
        A new list of string tokens. The input is not modified.
    """
    merged = []
    pos = 0
    size = len(text)
    # Iterate over *all* positions: stopping at size - 1 would silently drop
    # the final symbol whenever it is not the right half of a merged pair.
    while pos < size:
        if pos + 1 < size and (text[pos], text[pos + 1]) == max_pair:
            merged.append(text[pos] + text[pos + 1])
            pos += 2
        else:
            merged.append(text[pos])
            pos += 1
    return merged
def BPE(text, num_iter):
    """Run up to *num_iter* rounds of byte-pair-encoding merges on *text*.

    Each round counts all adjacent symbol pairs with ``find_pairs``, picks
    the most frequent one, and fuses its occurrences with ``merge``. On a
    tie, ``max`` keeps the first maximal pair in the mapping's iteration
    order.

    Args:
        text: A sequence of symbols -- a string or a list of string tokens.
        num_iter: Maximum number of merge rounds to perform.

    Returns:
        The token list produced by the last merge. If no merge was performed
        (``num_iter`` is 0, the input has fewer than two symbols, or no pair
        occurs more than once), the input object is returned unchanged.
    """
    for _ in range(num_iter):
        pairs = find_pairs(text)
        if not pairs:
            break  # fewer than two symbols left; max() would raise on empty
        max_pair = max(pairs, key=pairs.get)
        if pairs[max_pair] == 1:
            break  # no more merges possible
        text = merge(text, max_pair)
    return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment