viachaslavic · March 25, 2025 13:07
diff --git a/be-bel-audio-corpus_downloader.py b/be-bel-audio-corpus_downloader.py
 #!/usr/bin/env python

 """
 Takes as input an argument with the starting page number (from 1 to 465). Without arguments downloads all from the first page.
 """

 import os, sys
 import urllib.request, json

 # Exclusive upper bound of the page loop in processing(); each page is one
 # 100-row request, and the module docstring numbers pages from 1 to 465.
 n_pages = 465

 def processing(start_page):
     """Download transcripts and audio clips page by page.

     For every page index from ``start_page`` (zero-based) up to, but not
     including, the module-level ``n_pages``, request 100 rows from the
     ``datasets-server.huggingface.co/rows`` endpoint.  For each row, create
     the directory ``<dataset>/<speaker_name>``, write the row's text to a
     ``.txt`` file and retrieve every listed audio source to the audio path
     next to it.

     Args:
         start_page: Zero-based index of the first page to fetch.

     Raises:
         urllib.error.URLError: If a page or audio request fails.
         OSError: If a directory or file cannot be created or written.
     """
     page_size = 100  # rows requested per page
     for i in range(start_page, n_pages):
         offset = i * page_size
         print("Processing page", i + 1, ": from ", offset, "to", offset + page_size - 1)
         page = (
             "https://datasets-server.huggingface.co/rows"
             "?dataset=fosters%2Fbe-bel-audio-corpus&config=default&split=train"
             "&offset=" + str(offset) + "&length=" + str(page_size)
         )
         # Parse the whole response first so the HTTP connection is released
         # before the (potentially slow) per-row audio downloads start.
         with urllib.request.urlopen(page) as url:
             json_object = json.load(url)

         for row in json_object['rows']:
             record = row['row']
             savedir = record['dataset'] + "/" + record['speaker_name']
             os.makedirs(savedir, exist_ok=True)

             # NOTE(review): assumes 'file' has the form "<dir>/<name>"; only
             # the second path component is used -- confirm against the dataset.
             wav_name = record['file'].split('/')[1]
             wav_path = savedir + "/" + wav_name
             # splitext removes only the final extension, so names containing
             # additional dots keep their full stem.
             txt_path = savedir + "/" + os.path.splitext(wav_name)[0] + ".txt"
             # The context manager guarantees the file is flushed and closed;
             # an explicit encoding keeps the output independent of the locale.
             with open(txt_path, "w", encoding="utf-8") as txt_file:
                 txt_file.write(record['text'] + '\n')

             # Every entry is saved to the same wav_path, so when a row lists
             # several sources the last one retrieved is the one kept.
             for audio in record['audio']:
                 urllib.request.urlretrieve(audio['src'], wav_path)

 # Exactly one command-line argument selects the 1-based start page;
 # any other argument count falls back to the first page.
 start_page = sys.argv[1] if len(sys.argv) == 2 else 1

 # processing() works with zero-based page indices.
 processing(int(start_page) - 1)
diff --git a/be-bel-audio-corpus_extractor.py b/be-bel-audio-corpus_extractor.py
 #!/usr/bin/env python

 """
 python -m venv extractor
 source extractor/bin/activate
 pip install datasets pandas pyarrow librosa soundfile
 """

 import os, sys
 from datasets import load_dataset
 import soundfile as sf


 # Load the "train" split and write every sample to
 # <dataset>/<speaker_name>/ as an audio file plus a same-named .txt transcript.
 dataset = load_dataset("fosters/be-bel-audio-corpus", split="train")

 for sample in dataset:
     savedir = sample['dataset'] + "/" + sample['speaker_name']
     os.makedirs(savedir, exist_ok=True)

     audio = sample["audio"]
     # NOTE(review): assumes audio['path'] is a bare file name relative to
     # savedir -- confirm against the dataset.
     wav_path = savedir + "/" + audio['path']
     sf.write(wav_path, audio['array'], audio['sampling_rate'])

     # splitext removes only the final extension, so names containing
     # additional dots keep their full stem.
     txt_path = savedir + "/" + os.path.splitext(audio['path'])[0] + ".txt"
     # The context manager guarantees the file is flushed and closed; an
     # explicit encoding keeps the output independent of the locale.
     with open(txt_path, "w", encoding="utf-8") as txt_file:
         txt_file.write(sample['text'] + '\n')

     print("Extracted: ", wav_path, "and", txt_path)
	#!/usr/bin/env python

	"""
	Takes as input an argument with the starting page number (from 1 to 465). Without arguments downloads all from the first page.
	"""

	import os, sys
	import urllib.request, json

	# Exclusive upper bound of the page loop in processing(); each page is one
	# 100-row request, and the module docstring numbers pages from 1 to 465.
	n_pages = 465

	def processing(start_page):
	    """Download transcripts and audio clips page by page.

	    For every page index from ``start_page`` (zero-based) up to, but not
	    including, the module-level ``n_pages``, request 100 rows from the
	    ``datasets-server.huggingface.co/rows`` endpoint.  For each row, create
	    the directory ``<dataset>/<speaker_name>``, write the row's text to a
	    ``.txt`` file and retrieve every listed audio source to the audio path
	    next to it.

	    Args:
	        start_page: Zero-based index of the first page to fetch.

	    Raises:
	        urllib.error.URLError: If a page or audio request fails.
	        OSError: If a directory or file cannot be created or written.
	    """
	    page_size = 100  # rows requested per page
	    for i in range(start_page, n_pages):
	        offset = i * page_size
	        print("Processing page", i + 1, ": from ", offset, "to", offset + page_size - 1)
	        page = (
	            "https://datasets-server.huggingface.co/rows"
	            "?dataset=fosters%2Fbe-bel-audio-corpus&config=default&split=train"
	            "&offset=" + str(offset) + "&length=" + str(page_size)
	        )
	        # Parse the whole response first so the HTTP connection is released
	        # before the (potentially slow) per-row audio downloads start.
	        with urllib.request.urlopen(page) as url:
	            json_object = json.load(url)

	        for row in json_object['rows']:
	            record = row['row']
	            savedir = record['dataset'] + "/" + record['speaker_name']
	            os.makedirs(savedir, exist_ok=True)

	            # NOTE(review): assumes 'file' has the form "<dir>/<name>"; only
	            # the second path component is used -- confirm against the dataset.
	            wav_name = record['file'].split('/')[1]
	            wav_path = savedir + "/" + wav_name
	            # splitext removes only the final extension, so names containing
	            # additional dots keep their full stem.
	            txt_path = savedir + "/" + os.path.splitext(wav_name)[0] + ".txt"
	            # The context manager guarantees the file is flushed and closed;
	            # an explicit encoding keeps the output independent of the locale.
	            with open(txt_path, "w", encoding="utf-8") as txt_file:
	                txt_file.write(record['text'] + '\n')

	            # Every entry is saved to the same wav_path, so when a row lists
	            # several sources the last one retrieved is the one kept.
	            for audio in record['audio']:
	                urllib.request.urlretrieve(audio['src'], wav_path)

	# Exactly one command-line argument selects the 1-based start page;
	# any other argument count falls back to the first page.
	start_page = sys.argv[1] if len(sys.argv) == 2 else 1

	# processing() works with zero-based page indices.
	processing(int(start_page) - 1)
	#!/usr/bin/env python

	"""
	python -m venv extractor
	source extractor/bin/activate
	pip install datasets pandas pyarrow librosa soundfile
	"""

	import os, sys
	from datasets import load_dataset
	import soundfile as sf


	# Load the "train" split and write every sample to
	# <dataset>/<speaker_name>/ as an audio file plus a same-named .txt transcript.
	dataset = load_dataset("fosters/be-bel-audio-corpus", split="train")

	for sample in dataset:
	    savedir = sample['dataset'] + "/" + sample['speaker_name']
	    os.makedirs(savedir, exist_ok=True)

	    audio = sample["audio"]
	    # NOTE(review): assumes audio['path'] is a bare file name relative to
	    # savedir -- confirm against the dataset.
	    wav_path = savedir + "/" + audio['path']
	    sf.write(wav_path, audio['array'], audio['sampling_rate'])

	    # splitext removes only the final extension, so names containing
	    # additional dots keep their full stem.
	    txt_path = savedir + "/" + os.path.splitext(audio['path'])[0] + ".txt"
	    # The context manager guarantees the file is flushed and closed; an
	    # explicit encoding keeps the output independent of the locale.
	    with open(txt_path, "w", encoding="utf-8") as txt_file:
	        txt_file.write(sample['text'] + '\n')

	    print("Extracted: ", wav_path, "and", txt_path)