Skip to content

Instantly share code, notes, and snippets.

@jowagner
Forked from thomwolf/loading_wikipedia.py
Last active June 18, 2020 14:10
Show Gist options
  • Save jowagner/4473b604c2bdbbbbb6d9603ea061486f to your computer and use it in GitHub Desktop.
Save jowagner/4473b604c2bdbbbbb6d9603ea061486f to your computer and use it in GitHub Desktop.
Load full English Wikipedia dataset in HuggingFace nlp library
#!/usr/bin/env python
# based on https://gist.github.com/thomwolf/13ca2b2b172b2d17ac66685aa2eeba62
# support for --len adapted from https://gist.github.com/lhoestq/8f317e47c6f8b6bc50ef1275f655a3a3
# support for --count-spaces Joachim Wagner 2020-06-17
# support for --read-first Joachim Wagner 2020-06-18
# requirements: pip install nlp psutil six
import os; import psutil; import timeit
import sys
from nlp import load_dataset
def _build_benchmark_stmt(argv):
    """Return the source code of the loop that ``timeit`` will execute.

    The loop always walks the dataset in slices of 1000 rows. Command-line
    flags found in *argv* add per-batch work on the ``"text"`` column:

    * ``--len``          sum the length of every text
    * ``--count-spaces`` count the space characters in every text
    * ``--read-first``   read only the first character of every non-empty text

    A short notice is printed for each enabled flag. The statement is built
    as a list of lines and joined with newlines, so any combination of flags
    yields valid code (plain string concatenation used to glue the
    ``--read-first`` loop onto the end of the previous line).
    """
    lines = [
        'batch_size = 1000',
        'total_length = 0',
        'n_spaces = 0',
        'ord_sum = 0',
        'for i in range(0, len(wiki), batch_size):',
        '    batch = wiki[i:i + batch_size]',
    ]
    if '--len' in argv:
        lines.append(
            '    total_length += sum(len(sample_text) for sample_text in batch["text"])'
        )
        print('getting length')
    if '--count-spaces' in argv:
        lines.append(
            '    n_spaces += sum(sample_text.count(" ") for sample_text in batch["text"])'
        )
        print('counting spaces')
    if '--read-first' in argv:
        lines.extend([
            '    for sample_text in batch["text"]:',
            '        if sample_text:',
            '            ord_sum += ord(sample_text[0])',
        ])
        print('reading first character')
    return '\n'.join(lines) + '\n'


# Resident set size of this process in MiB (bytes >> 20), before and after
# loading the dataset.
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss >> 20
wiki = load_dataset("wikipedia", "20200501.en", split='train')
mem_after = process.memory_info().rss >> 20
print(f"RAM memory used: {(mem_after - mem_before)} MB")

# The accumulators inside the statement only exist to force the selected work
# to happen; their values are never reported by this script.
s = _build_benchmark_stmt(sys.argv)

# globals() makes the module-level `wiki` visible to the timed statement.
time = timeit.timeit(stmt=s, number=1, globals=globals())

# dataset_size is divided by 2**30, so `size` is in GiB; the throughput is
# size * 8 bits divided by the elapsed seconds.
size = wiki.dataset_size / 2**30
print(f"Iterated over the {size:.1f} GB dataset in {time:.1f} s, i.e. {size * 8/time:.1f} Gbit/s")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment