Created
January 7, 2022 14:48
-
-
Save DevloperHS/a461c643c1bcfd3b2be58a94486d8ff4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def main():
    """Fetch, unpack and convert the Common Voice corpus into manifests.

    All settings come from the ``args`` object defined outside this function;
    the attributes read here are ``data_root``, ``version``, ``language``,
    ``files_to_process``, ``num_workers`` and ``manifest_dir``.

    Steps:
      1. Ensure ``<data_root>/CV_unpacked`` exists. If it is missing, the
         archive at ``COMMON_VOICE_URL`` is downloaded into ``data_root``
         and extracted into that folder.
      2. For every entry of ``args.files_to_process``, call
         ``process_files`` on ``<CV_unpacked>/<version>/<language>/<entry>``
         and hand its result to ``create_manifest``, which is asked to write
         ``commonvoice_<entry stem>_manifest.json`` under
         ``args.manifest_dir``.

    Raises:
        Any exception raised by the download, the extraction or the
        processing helpers is propagated unchanged.
    """
    # Local import: only needed to clean up after a failed extraction.
    import shutil

    data_root = args.data_root
    os.makedirs(data_root, exist_ok=True)

    target_unpacked_dir = os.path.join(data_root, "CV_unpacked")

    if os.path.exists(target_unpacked_dir):
        logging.info(f'Find existing folder {target_unpacked_dir}')
    else:
        logging.info("Could not find Common Voice, Downloading corpus...")

        filename = wget.download(COMMON_VOICE_URL, data_root)
        target_file = os.path.join(data_root, os.path.basename(filename))

        os.makedirs(target_unpacked_dir, exist_ok=True)
        logging.info(f"Unpacking corpus to {target_unpacked_dir} ...")
        try:
            # The context manager closes the archive even if extraction fails.
            # NOTE(review): extractall() trusts member paths inside the
            # archive; consider an extraction filter if the minimum supported
            # Python version provides one.
            with tarfile.open(target_file) as tar:
                tar.extractall(target_unpacked_dir)
        except BaseException:
            # The mere existence of this folder is what marks the corpus as
            # "already unpacked" on the next run, so a partial extraction
            # (including Ctrl+C) must not leave it behind.
            shutil.rmtree(target_unpacked_dir, ignore_errors=True)
            raise

    # Pass the components separately instead of gluing them with '/'.
    folder_path = os.path.join(target_unpacked_dir, args.version, args.language)

    for csv_file in args.files_to_process:
        # File name without its extension; names both the per-split output
        # folder and the manifest file.
        split_name = os.path.splitext(csv_file)[0]

        data = process_files(
            csv_file=os.path.join(folder_path, csv_file),
            data_root=os.path.join(data_root, split_name),
            num_workers=args.num_workers,
        )

        logging.info('Creating manifests...')
        create_manifest(
            data=data,
            output_name=f'commonvoice_{split_name}_manifest.json',
            manifest_path=args.manifest_dir,
        )


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment