-
-
Save sagorbrur/7edbeca06310a19bbf75048067a7068b to your computer and use it in GitHub Desktop.
Script for downloading data of the GLUE benchmark (gluebenchmark.com)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Script for downloading all GLUE data. | |
| Modified by: Sagor Sarker | |
| Dependencies: | |
| pip install wget | |
| pip install wasabi | |
| """ | |
| import os | |
| import wget | |
| from wasabi import msg | |
| from zipfile import ZipFile | |
def main():
    """Download the GLUE benchmark data into a local ``glue_data`` directory.

    The MRPC train/test text files are fetched first into ``glue_data/MRPC``.
    Every entry of ``TASK2PATH`` is then downloaded into ``glue_data``; files
    ending in ``.zip`` are extracted there and the archive is deleted
    afterwards, while other files (the diagnostic ``AX.tsv``) are kept as
    downloaded.

    Downloading is best-effort: a failure for one task is reported and the
    remaining tasks are still attempted.
    """
    # Task label -> download URL. The label is only used in progress
    # messages, so each one must name the archive it actually points at.
    TASK2PATH = {
        "CoLA": 'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
        "SST": 'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
        "QQP": 'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
        "STS": 'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
        "MNLI": 'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
        "QNLI": 'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
        "RTE": 'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
        "WNLI": 'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
        "diagnostic": 'https://dl.fbaipublicfiles.com/glue/data/AX.tsv',
    }

    # check and create glue_data directory
    download_path = "glue_data"
    os.makedirs(download_path, exist_ok=True)

    # download MRPC (two plain-text files, stored in their own sub-directory)
    try:
        mrpc_path = os.path.join(download_path, 'MRPC')
        os.makedirs(mrpc_path, exist_ok=True)
        msg.info("Downloading MRPC data")
        MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
        MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
        wget.download(MRPC_TRAIN, mrpc_path)
        wget.download(MRPC_TEST, mrpc_path)
        msg.good("Download Completed")
    except Exception as e:
        # Best-effort: report the failure and carry on with the other tasks.
        msg.fail("Failed to download MRPC", str(e))

    # downloading other task data
    for task_name, data_link in TASK2PATH.items():
        try:
            msg.info(f"Downloading {task_name}")
            # Use the path returned by wget rather than rebuilding it from the
            # URL: if a file with the same name already exists, wget saves
            # under a de-duplicated name, and that is the file to extract.
            task_file_path = wget.download(data_link, download_path)
            # Leading newline terminates wget's in-place progress bar line.
            msg.good("\nDownload completed")
            if task_file_path.endswith('.zip'):
                msg.info(f"Extracting {task_name}")
                with ZipFile(task_file_path) as zf:
                    zf.extractall(download_path)
                msg.good("Extraction completed")
                # The archive is no longer needed once its contents are out.
                os.remove(task_file_path)
        except Exception as e:
            # Best-effort: one broken task must not abort the remaining ones.
            msg.fail(f"Failed to download {task_name}", str(e))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment