Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Created September 18, 2023 22:20
Show Gist options
  • Save fsndzomga/ae8f28f89b7c59f2b17c27c2b9f1b321 to your computer and use it in GitHub Desktop.
Save fsndzomga/ae8f28f89b7c59f2b17c27c2b9f1b321 to your computer and use it in GitHub Desktop.
Byte Pair Encoding Algorithm
from collections import Counter, defaultdict
import re
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a word, written as whitespace-separated
            symbols (e.g. ``"l o w"``), to that word's frequency.

    Returns:
        A ``Counter`` keyed by ``(left_symbol, right_symbol)`` tuples whose
        values are the pair counts weighted by word frequency.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        pieces = entry.split()
        # Zipping the sequence with itself shifted by one yields every
        # adjacent (left, right) pair, in left-to-right order.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts
def merge_vocab(pair, vocab):
    """Merge every occurrence of a symbol pair into a single symbol.

    Args:
        pair: Tuple ``(left, right)`` of the two symbols to merge.
        vocab: Mapping from a word, written as whitespace-separated
            symbols (e.g. ``"l o w"``), to that word's frequency.

    Returns:
        A new ``defaultdict(int)`` with the same frequencies, where each
        key has had adjacent ``left right`` symbols replaced by the
        concatenated symbol ``leftright``. ``vocab`` is not modified.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # The lookarounds anchor the match to whole symbols: the bigram must
    # not be preceded or followed by a non-whitespace character. Without
    # them, merging ('a', 'b') would also rewrite 'xa b' into 'xab'.
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    def _literal(_match):
        # A callable replacement is inserted verbatim, so backslashes in
        # the symbols are not interpreted as regex template escapes.
        return replacement

    new_vocab = defaultdict(int)
    for word, freq in vocab.items():
        # Accumulate rather than assign so that no count is lost if two
        # keys collapse to the same merged key.
        new_vocab[pattern.sub(_literal, word)] += freq
    return new_vocab
def get_vocab(text):
    """Build the initial character-level vocabulary from raw text.

    Args:
        text: Input corpus; it is split on whitespace into words.

    Returns:
        A dict mapping each distinct word, rewritten with a single space
        between its characters (``"low"`` -> ``"l o w"``), to the number
        of times that word occurs in ``text``.
    """
    spaced_counts = {}
    for token, count in Counter(text.split()).items():
        # Joining a string with ' ' puts a space between its characters.
        spaced_counts[' '.join(token)] = count
    return spaced_counts
def bpe(text, num_merges=10):
    """Run Byte Pair Encoding merges over the words of ``text``.

    Args:
        text: Input corpus, passed to ``get_vocab`` to build the initial
            vocabulary.
        num_merges: Maximum number of merge steps to perform.

    Returns:
        The vocabulary after merging: the value returned by the last
        ``merge_vocab`` call, or the initial ``get_vocab`` result if no
        merge was performed.
    """
    vocab = get_vocab(text)
    for _ in range(num_merges):
        pair_counts = get_stats(vocab)
        if not pair_counts:
            # No adjacent symbol pairs remain, so nothing is left to merge.
            break
        # max() returns the first maximal entry, so ties go to the pair
        # that appears first in pair_counts.
        top_pair, _count = max(pair_counts.items(), key=lambda item: item[1])
        vocab = merge_vocab(top_pair, vocab)
    return vocab
# Example usage. These statements run at module level whenever the file is
# executed or imported.

# Example text corpus: placeholder string, replace it with real text.
text_corpus = "put your corpus here"
# Upper bound on the number of merge operations passed to bpe().
num_merges = 10
# Apply the BPE algorithm to the corpus.
result_vocab = bpe(text_corpus, num_merges=num_merges)
# Bare expression: the value is only echoed in an interactive session;
# run as a plain script this line prints nothing.
# NOTE(review): presumably written for a notebook/REPL; confirm.
result_vocab
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment