Created
February 13, 2023 16:13
-
-
Save Vaibhavs10/eac93eee5b9beec6f11db7ee50eb72ea to your computer and use it in GitHub Desktop.
How to use Common Voice 11 with 🤗Datasets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the dataset (locally)
from datasets import load_dataset

# "hi" selects the Hindi configuration of Common Voice 11; only the
# "train" split is requested.
cv_11 = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="train")
# Stream the dataset
from datasets import load_dataset

# streaming=True asks for an iterable dataset instead of a fully
# materialised one, so examples are pulled one at a time.
cv_11 = load_dataset(
    "mozilla-foundation/common_voice_11_0", "hi", split="train", streaming=True
)

# Fetch and display the first example to confirm the stream works.
print(next(iter(cv_11)))
# Create a PyTorch dataloader
from datasets import load_dataset
from torch.utils.data import DataLoader  # was missing: DataLoader is used below
from torch.utils.data.sampler import BatchSampler, RandomSampler

cv_11 = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="train")

# Draw indices in random order and group them into batches of 32,
# keeping the final (possibly smaller) batch.
batch_sampler = BatchSampler(RandomSampler(cv_11), batch_size=32, drop_last=False)

# The batch sampler already defines the batching, so batch_size is not
# passed to the DataLoader itself.
dataloader = DataLoader(cv_11, batch_sampler=batch_sampler)
# Create a streaming PyTorch dataloader
from datasets import load_dataset
from torch.utils.data import DataLoader

# streaming=True is what makes this the streaming variant: without it the
# whole split is downloaded first, exactly as in the non-streaming example.
cv_11 = load_dataset(
    "mozilla-foundation/common_voice_11_0", "hi", split="train", streaming=True
)

# A streamed dataset is iterated sequentially, so plain batch_size batching
# is used here rather than an index-based sampler.
dataloader = DataLoader(cv_11, batch_size=32)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment