Skip to content

Instantly share code, notes, and snippets.

@DevloperHS
Created January 7, 2022 14:48
Show Gist options
  • Select an option

  • Save DevloperHS/a461c643c1bcfd3b2be58a94486d8ff4 to your computer and use it in GitHub Desktop.

Select an option

Save DevloperHS/a461c643c1bcfd3b2be58a94486d8ff4 to your computer and use it in GitHub Desktop.
def main():
    """Fetch and unpack the Common Voice corpus if needed, then build manifests.

    Configuration is read from the ``args`` object defined outside this
    function (presumably an argparse namespace -- confirm against the rest of
    the file). The attributes used are ``data_root``, ``version``,
    ``language``, ``files_to_process``, ``num_workers`` and ``manifest_dir``.

    Steps:
      1. Ensure ``data_root`` exists.
      2. If ``<data_root>/CV_unpacked`` is missing, download the archive from
         ``COMMON_VOICE_URL`` and extract it there.
      3. For every entry in ``args.files_to_process``, call ``process_files``
         and hand its result to ``create_manifest``.

    Raises:
        Any exception raised by the download, the extraction or the
        processing helpers is propagated. If extraction fails, the partially
        populated ``CV_unpacked`` directory is removed first.
    """
    # Local import so this block does not depend on a file-level import that
    # may not exist.
    import shutil

    data_root = args.data_root
    os.makedirs(data_root, exist_ok=True)

    target_unpacked_dir = os.path.join(data_root, "CV_unpacked")

    if os.path.exists(target_unpacked_dir):
        logging.info('Find existing folder {}'.format(target_unpacked_dir))
    else:
        logging.info("Could not find Common Voice, Downloading corpus...")
        filename = wget.download(COMMON_VOICE_URL, data_root)
        target_file = os.path.join(data_root, os.path.basename(filename))

        os.makedirs(target_unpacked_dir, exist_ok=True)
        logging.info("Unpacking corpus to {} ...".format(target_unpacked_dir))
        try:
            # The context manager closes the archive even if extraction fails.
            # NOTE(review): extractall() trusts member paths inside the
            # archive; consider the `filter="data"` argument on Python
            # versions that support it.
            with tarfile.open(target_file) as tar:
                tar.extractall(target_unpacked_dir)
        except BaseException:
            # The mere existence of this directory is what marks the corpus
            # as "already unpacked" on the next run, so a half-extracted tree
            # (including after Ctrl-C) must not be left behind.
            shutil.rmtree(target_unpacked_dir, ignore_errors=True)
            raise

    folder_path = os.path.join(target_unpacked_dir, args.version, args.language)

    for csv_file in args.files_to_process:
        # File name without its extension; used for both the per-split output
        # directory and the manifest file name.
        split_name = os.path.splitext(csv_file)[0]

        data = process_files(
            csv_file=os.path.join(folder_path, csv_file),
            data_root=os.path.join(data_root, split_name),
            num_workers=args.num_workers,
        )
        logging.info('Creating manifests...')
        create_manifest(
            data=data,
            output_name=f'commonvoice_{split_name}_manifest.json',
            manifest_path=args.manifest_dir,
        )
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment