Skip to content

Instantly share code, notes, and snippets.

@Vaibhavs10
Created February 13, 2023 16:13
Show Gist options
  • Save Vaibhavs10/eac93eee5b9beec6f11db7ee50eb72ea to your computer and use it in GitHub Desktop.
Save Vaibhavs10/eac93eee5b9beec6f11db7ee50eb72ea to your computer and use it in GitHub Desktop.
How to use Common Voice 11 with 🤗Datasets
# Load the dataset (locally)
# Requests the "hi" configuration's train split of Common Voice 11 as a
# regular (non-streaming) dataset object bound to `cv_11`.
from datasets import load_dataset

cv_11 = load_dataset(
    path="mozilla-foundation/common_voice_11_0",
    name="hi",
    split="train",
)
# Stream the dataset
# Same dataset/config/split as a non-streaming load, but with streaming=True
# so examples are iterated over rather than loaded up front.
from datasets import load_dataset

cv_11 = load_dataset(
    path="mozilla-foundation/common_voice_11_0",
    name="hi",
    split="train",
    streaming=True,
)

# Pull just the first example off the stream and display it.
first_example = next(iter(cv_11))
print(first_example)
# Create a PyTorch dataloader
# Wraps the (non-streaming) train split in a DataLoader that draws examples
# in random order, 32 at a time.
from datasets import load_dataset
# DataLoader must be imported here: this snippet uses it below and is meant
# to run on its own.
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler, RandomSampler

cv_11 = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="train")

# RandomSampler picks indices into cv_11 in random order; BatchSampler groups
# them into batches of 32 and, with drop_last=False, keeps a final batch that
# has fewer than 32 items.
batch_sampler = BatchSampler(RandomSampler(cv_11), batch_size=32, drop_last=False)

# Batching is delegated to batch_sampler, so batch_size is not passed again.
dataloader = DataLoader(cv_11, batch_sampler=batch_sampler)
# Create a streaming PyTorch dataloader
# Builds a DataLoader over a *streamed* train split, yielding batches of 32.
from datasets import load_dataset
from torch.utils.data import DataLoader

# streaming=True is what makes this a streaming dataloader: without it,
# load_dataset performs a regular (non-streaming) load of the split.
cv_11 = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "hi",
    split="train",
    streaming=True,
)

# No sampler is passed: examples are consumed in the order the stream
# produces them and grouped into batches of 32.
dataloader = DataLoader(cv_11, batch_size=32)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment