Skip to content

Instantly share code, notes, and snippets.

@viachaslavic
Last active March 25, 2025 13:07
Show Gist options
  • Save viachaslavic/a839fa35c860b1845591a39265eddc82 to your computer and use it in GitHub Desktop.
Save viachaslavic/a839fa35c860b1845591a39265eddc82 to your computer and use it in GitHub Desktop.
be-bel-audio-corpus dataset downloader
#!/usr/bin/env python
"""
Takes one optional argument: the starting page number (from 1 to 465). Without arguments, downloads everything starting from the first page.
"""
import os, sys
import urllib.request, json
# Total number of 100-row pages that processing() walks through.
n_pages = 465


def processing(start_page):
    """Download audio clips and their transcripts, one page at a time.

    For every page from ``start_page`` up to (but not including) the
    module-level ``n_pages``, a batch of 100 rows is requested as JSON from
    the Hugging Face datasets-server. For each row:

    * the directory ``<dataset>/<speaker_name>`` is created if missing;
    * the row's text is written, UTF-8 encoded, to a ``.txt`` file named
      after the audio file;
    * every audio source listed in the row is downloaded to the audio path.

    Args:
        start_page: Zero-based index of the first page to fetch. A value
            greater than or equal to ``n_pages`` results in no work.

    Network and file-system errors are not caught here; they propagate to
    the caller.
    """
    page_size = 100
    base_url = ("https://datasets-server.huggingface.co/rows"
                "?dataset=fosters%2Fbe-bel-audio-corpus&config=default&split=train")
    for i in range(start_page, n_pages):
        offset = i * page_size
        print("Processing page", i + 1, ": from ", offset, "to", offset + page_size - 1)
        page = base_url + "&offset=" + str(offset) + "&length=" + str(page_size)
        with urllib.request.urlopen(page) as url:
            json_object = json.load(url)
        for row in json_object['rows']:
            record = row['row']
            savedir = record['dataset'] + "/" + record['speaker_name']
            os.makedirs(savedir, exist_ok=True)
            # Use the final path component as the file name, whatever the
            # number of directory levels in the 'file' field.
            wav_name = os.path.basename(record['file'])
            wav_path = savedir + "/" + wav_name
            # splitext drops only the last extension, so names containing
            # extra dots keep their full stem.
            txt_path = savedir + "/" + os.path.splitext(wav_name)[0] + '.txt'
            # Explicit UTF-8 keeps the transcript independent of the
            # platform's default encoding; the context manager guarantees
            # the file is flushed and closed.
            with open(txt_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(record['text'] + '\n')
            # NOTE(review): every source is saved to the same wav_path, so
            # with several entries the last one wins — confirm this is intended.
            for audio in record['audio']:
                urllib.request.urlretrieve(audio['src'], wav_path)
# A single command-line argument, when given, is the 1-based page to start
# from; otherwise start from page 1.
start_page = sys.argv[1] if len(sys.argv) == 2 else 1
# processing() works with zero-based page indices.
processing(int(start_page) - 1)
#!/usr/bin/env python
"""
python -m venv extractor
source extractor/bin/activate
pip install datasets pandas pyarrow librosa soundfile
"""
import os, sys
from datasets import load_dataset
import soundfile as sf
# Load the "train" split of the corpus and write every sample to disk as
# <dataset>/<speaker_name>/<audio file> plus a matching .txt transcript.
dataset = load_dataset("fosters/be-bel-audio-corpus", split="train")
for sample in dataset:
    savedir = sample['dataset'] + "/" + sample['speaker_name']
    os.makedirs(savedir, exist_ok=True)
    audio = sample["audio"]
    wav_path = savedir + "/" + audio['path']
    # Write the sample's audio array at its own sampling rate.
    sf.write(wav_path, audio['array'], audio['sampling_rate'])
    # splitext drops only the last extension, so names containing extra
    # dots keep their full stem.
    txt_path = savedir + "/" + os.path.splitext(audio['path'])[0] + '.txt'
    # Explicit UTF-8 keeps the transcript independent of the platform's
    # default encoding; the context manager guarantees the file is closed.
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(sample['text'] + '\n')
    print("Extracted: ", wav_path, "and", txt_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment