-
-
Save jowagner/4473b604c2bdbbbbb6d9603ea061486f to your computer and use it in GitHub Desktop.
Load full English Wikipedia dataset in HuggingFace nlp library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# based on https://gist.github.com/thomwolf/13ca2b2b172b2d17ac66685aa2eeba62
# support for --len adapted from https://gist.github.com/lhoestq/8f317e47c6f8b6bc50ef1275f655a3a3
# support for --count-spaces Joachim Wagner 2020-06-17
# support for --read-first Joachim Wagner 2020-06-18
# requirements: pip install nlp psutil six
"""Load the English Wikipedia dataset and time one batched pass over it.

Optional command-line flags add per-batch work to the timed loop:

    --len           sum the length of every article text
    --count-spaces  count the space characters in every article text
    --read-first    read the first character of every non-empty article text

The flags can be combined freely.
"""
import os
import sys
import timeit

# Statement prefix that is always timed: walk the dataset in fixed-size
# slices.  `wiki` is supplied through the `globals` argument of timeit.
_LOOP_HEADER = (
    "batch_size = 1000\n"
    "total_length = 0\n"
    "n_spaces = 0\n"
    "ord_sum = 0\n"
    "for i in range(0, len(wiki), batch_size):\n"
    "    batch = wiki[i:i + batch_size]\n"
)

# (command-line flag, progress message, loop-body snippet).
# Every snippet is indented to the loop body and ends with a newline so that
# any combination of flags concatenates into valid Python.  (Previously the
# --len / --count-spaces snippets had no trailing newline, so adding
# --read-first after them produced a SyntaxError.)
_OPTIONS = (
    (
        "--len",
        "getting length",
        '    total_length += sum(len(sample_text) for sample_text in batch["text"])\n',
    ),
    (
        "--count-spaces",
        "counting spaces",
        '    n_spaces += sum(sample_text.count(" ") for sample_text in batch["text"])\n',
    ),
    (
        "--read-first",
        "reading first character",
        '    for sample_text in batch["text"]:\n'
        "        if sample_text:\n"
        "            ord_sum += ord(sample_text[0])\n",
    ),
)


def build_benchmark_stmt(argv):
    """Assemble the source code of the statement to be timed.

    Args:
        argv: Sequence of command-line arguments; only membership of the
            known flags is checked, everything else is ignored.

    Returns:
        A ``(stmt, messages)`` tuple: ``stmt`` is the Python source to pass
        to ``timeit`` (it expects a global named ``wiki``), ``messages`` is
        the list of progress lines to print, one per enabled flag.
    """
    parts = [_LOOP_HEADER]
    messages = []
    for flag, message, snippet in _OPTIONS:
        if flag in argv:
            parts.append(snippet)
            messages.append(message)
    return "".join(parts), messages


def main(argv=None):
    """Load the dataset, report memory growth, then time one iteration pass.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv``.
    """
    # Third-party imports are deferred so that build_benchmark_stmt can be
    # imported and checked without the heavy dependencies installed.
    import psutil
    from nlp import load_dataset

    if argv is None:
        argv = sys.argv

    process = psutil.Process(os.getpid())
    # rss is in bytes; >> 20 converts to whole mebibytes.
    mem_before = process.memory_info().rss >> 20
    wiki = load_dataset("wikipedia", "20200501.en", split='train')
    mem_after = process.memory_info().rss >> 20
    print(f"RAM memory used: {(mem_after - mem_before)} MB")

    stmt, messages = build_benchmark_stmt(argv)
    for message in messages:
        print(message)

    # Only `wiki` has to be visible to the timed statement.
    elapsed = timeit.timeit(stmt=stmt, number=1, globals={"wiki": wiki})
    # NOTE: dividing by 2**30 yields GiB (and Gibit/s below); the output
    # labels are kept unchanged for comparability with earlier runs.
    size = wiki.dataset_size / 2**30
    print(f"Iterated over the {size:.1f} GB dataset in {elapsed:.1f} s, i.e. {size * 8/elapsed:.1f} Gbit/s")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment