Created
July 14, 2021 21:09
-
-
Save enijkamp/e4898037ef106672a3dbfe1239d0662b to your computer and use it in GitHub Desktop.
bpe_ratio.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import io | |
import tempfile | |
import tensorflow as tf | |
import transformers | |
def write_to_file(writer, data):
    """Serialize ``data`` as a single ``tf.train.Example`` and write it.

    The values in ``data`` are stored as an int64 list under the feature
    key ``'text'``; the serialized example is handed to ``writer.write``.
    """
    int64_values = tf.train.Int64List(value=data)
    features = tf.train.Features(
        feature={'text': tf.train.Feature(int64_list=int64_values)}
    )
    record = tf.train.Example(features=features).SerializeToString()
    writer.write(record)
def compression_ratio(n=int(2**12), compression=''):
    """Return the size ratio of BPE-token TFRecords to raw UTF-8 text.

    A fixed sample sentence is repeated ``n`` times in two forms:

    * as raw UTF-8 bytes (size computed directly, nothing is written), and
    * as GPT-2 BPE token ids, one ``tf.train.Example`` per repetition,
      written to a temporary TFRecord file.

    Args:
        n: Number of repetitions of the sample sentence. Must be positive.
        compression: Passed through unchanged as the ``options`` argument
            of ``tf.io.TFRecordWriter``.

    Returns:
        ``tfrecord_bytes / raw_utf8_bytes`` as a float.

    Raises:
        ValueError: If ``n`` is not positive (the ratio would be undefined).
    """
    if n <= 0:
        # Fail fast, before loading the tokenizer, instead of dividing by
        # a zero byte count at the very end.
        raise ValueError(f'n must be a positive integer, got {n!r}')

    data_unicode = (
        'EleutherAI is a decentralized grassroots collective of volunteer '
        'researchers, engineers, and developers focused on AI alignment, '
        'scaling, and open source AI research. Founded in July of 2020, our '
        'flagship project is the GPT-Neo family of models designed to '
        'replicate those developed by OpenAI as GPT-3. Our Discord server is '
        'open and welcomes contributors.'
    )

    tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
    data_bpe = tokenizer.encode(data_unicode)

    # The raw payload is the same bytes repeated n times, so its size is
    # known without encoding and buffering it n times.
    raw_bytes = len(data_unicode.encode()) * n

    # Write into a temporary directory rather than re-opening a still-open
    # NamedTemporaryFile by name, which is not possible on every platform.
    with tempfile.TemporaryDirectory() as tmp_dir:
        record_path = os.path.join(tmp_dir, 'data.tfrecord')
        with tf.io.TFRecordWriter(record_path, options=compression) as w:
            for _ in range(n):
                write_to_file(w, data_bpe)
        # Measure only after the writer is closed so that everything it
        # buffered has reached the file.
        record_bytes = os.path.getsize(record_path)

    return float(record_bytes) / float(raw_bytes)
if __name__ == '__main__':
    # Script entry point: report the ratio for uncompressed TFRecords.
    ratio = compression_ratio(compression='')
    print(ratio)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment