Skip to content

Instantly share code, notes, and snippets.

@alumae
Created September 6, 2023 13:28
Show Gist options
  • Save alumae/d0664e995e179bbb02e0e86787f4c80b to your computer and use it in GitHub Desktop.
Save alumae/d0664e995e179bbb02e0e86787f4c80b to your computer and use it in GitHub Desktop.
import argparse
import glob
import io
import logging
import os
import random
import struct
import sys
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import numpy as np
import soundfile as sf
from tqdm import tqdm
#from examples.speech_to_text.data_utils import is_sf_audio_data, is_npy_data
def get_zip_manifest(
    zip_path: Path, zip_root: Optional[Path] = None, is_audio=False
):
    """Index the members of a ZIP archive by raw byte offset and size.

    For every member, the position of its data inside the archive file is
    determined and the data is read back directly from that position to
    measure its length.

    NOTE: the bytes are read raw from the archive, so the recorded
    ``offset``/``file_size`` only describe the member's real content when
    the member is stored uncompressed (``ZIP_STORED``).

    Args:
        zip_path: Path of the archive. This exact value (in POSIX form) is
            what gets embedded in the returned location strings.
        zip_root: Optional directory that ``zip_path`` is joined onto when
            opening the archive. It is not part of the location strings.
        is_audio: If true, member length is the frame count reported by
            ``soundfile.info``; otherwise the member is loaded with
            ``numpy.load`` and its first dimension is used.

    Returns:
        A ``(paths, lengths)`` tuple of dicts keyed by the member name with
        its last suffix removed. ``paths`` values are
        ``"<zip_path>:<offset>:<file_size>"`` strings; ``lengths`` values
        are the lengths described above. Members whose data is at most one
        byte long (e.g. directory entries) are skipped.

    Raises:
        zipfile.BadZipFile: If a member's local file header is truncated or
            does not start with the local-header signature.
    """
    local_header_size = 30  # fixed-size part of a ZIP local file header
    local_header_magic = b"PK\x03\x04"

    _zip_path = Path.joinpath(zip_root or Path(""), zip_path)
    with zipfile.ZipFile(_zip_path, mode="r") as archive:
        info = archive.infolist()

    paths, lengths = {}, {}
    # Open the archive once for raw reads instead of once per member.
    with open(_zip_path, "rb") as raw:
        for i in tqdm(info):
            member = Path(i.filename)
            utt_id = str(member)
            # Strip only the last suffix. Guard the empty-suffix case:
            # slicing with [:-0] would wipe the whole name.
            if member.suffix:
                utt_id = utt_id[: -len(member.suffix)]

            # The data starts after the local header, whose variable part
            # (encoded file name + extra field) can differ from what the
            # central directory reports, so read the lengths from the
            # local header itself.
            raw.seek(i.header_offset)
            local_header = raw.read(local_header_size)
            if (
                len(local_header) != local_header_size
                or local_header[:4] != local_header_magic
            ):
                raise zipfile.BadZipFile(
                    f"Bad local file header for {i.filename!r} in {_zip_path}"
                )
            name_len, extra_len = struct.unpack_from("<HH", local_header, 26)
            offset = i.header_offset + local_header_size + name_len + extra_len
            file_size = i.file_size

            raw.seek(offset)
            byte_data = raw.read(file_size)
            if len(byte_data) <= 1:
                continue

            paths[utt_id] = f"{zip_path.as_posix()}:{offset}:{file_size}"
            byte_data_fp = io.BytesIO(byte_data)
            if is_audio:
                lengths[utt_id] = sf.info(byte_data_fp).frames
            else:
                lengths[utt_id] = np.load(byte_data_fp).shape[0]
    return paths, lengths
if __name__ == '__main__':
    # Command-line entry point: build one TSV manifest from one or more
    # ZIP archives of audio files.
    #   usage: <script> OUT_TSV ZIP [ZIP ...]
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    cli = argparse.ArgumentParser()
    cli.add_argument("out_tsv")
    cli.add_argument("zips", nargs='+')
    args = cli.parse_args()

    with open(args.out_tsv, "w") as f_out:
        # The manifest starts with a line containing only "."
        # (presumably the root-directory line of the manifest format --
        # confirm against the consumer of this file).
        print(".", file=f_out)
        for zip_name in args.zips:
            audio_paths, audio_lengths = get_zip_manifest(
                Path(zip_name), is_audio=True
            )
            # One row per archive member: "<location>\t<number of frames>".
            for utt_id, location in audio_paths.items():
                n_frames = audio_lengths[utt_id]
                print(f"{location}\t{n_frames}", file=f_out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment