from dataclasses import dataclass, field

import torch
from torch import LongTensor, Tensor
from transformers import (
    AutoTokenizer,
    AutoModel,
    PreTrainedModel,
    PreTrainedTokenizer,
    BatchEncoding,
)
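The preview ends at the import block. A minimal sketch of how these pieces commonly fit together, assuming the gist wraps a tokenizer and model in a dataclass; the class name, fields, and the mean-pooling choice are illustrative assumptions, not the original code:

@dataclass
class Encoder:
    model_name: str = "bert-base-cased"
    tokenizer: PreTrainedTokenizer = field(init=False)
    model: PreTrainedModel = field(init=False)

    def __post_init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)

    @torch.no_grad()
    def encode(self, texts: list[str]) -> Tensor:
        # Tokenize with padding so the batch is rectangular
        encoded: BatchEncoding = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        output = self.model(**encoded)
        # Mean-pool the last hidden state over non-padding positions
        mask = encoded["attention_mask"].unsqueeze(-1)
        return (output.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)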
import gc
import numpy as np
import time
import pandas as pd
from tqdm import tqdm


def pack_documents_original(tokenized_documents, block_size: int = 8192, use_tqdm=True):
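    # Hedged sketch: the body below is a plausible reconstruction, not the
    # original. It greedily concatenates tokenized documents into blocks of
    # exactly `block_size` tokens, the baseline packing strategy used in
    # LM pretraining pipelines; documents longer than a block are split.
    blocks = []
    current = []
    for doc in tqdm(tokenized_documents, disable=not use_tqdm):
        current.extend(doc)
        # Flush every full block as soon as it is available
        while len(current) >= block_size:
            blocks.append(current[:block_size])
            current = current[block_size:]
    # Note: a trailing partial block (`current`) is dropped here
    return blocks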
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
import importlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from transformers import HfArgumentParser, AutoConfig, AutoTokenizer


@dataclass
class ScriptArguments:
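    # Hedged sketch: the preview cuts off before the field list, so the
    # fields below are illustrative, not the gist's own.
    model_name: str = field(metadata={"help": "Model name or path to load the config/tokenizer from"})
    output_dir: Optional[str] = field(default=None, metadata={"help": "Where to write any output"})


# Typical HfArgumentParser usage with such a dataclass:
parser = HfArgumentParser(ScriptArguments)
(script_args,) = parser.parse_args_into_dataclasses()
config = AutoConfig.from_pretrained(script_args.model_name)
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)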
# If there is an error in nvidia-smi, log it to a file in ~/gpu-errors!
nvidia_smi_output=$(nvidia-smi)
if echo "$nvidia_smi_output" | grep -q "ERR"; then
    fname=~/gpu-errors/$(hostname)-error.txt
    pdir=$(dirname "$fname")
    mkdir -p "$pdir"
    nvcc_output=$(nvcc --version)
    echo "$nvidia_smi_output"$'\n'"$nvcc_output" > "$fname"
fi
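A check like this is typically wired into a cron job or a scheduler prolog/epilog, so that nodes with failing GPUs leave behind a per-hostname record in ~/gpu-errors that can be inspected later.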
import os
import random
from typing import Optional

import numpy as np
import torch


def set_seed(seed: Optional[int]):
    if seed is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        random.seed(seed)
        os.environ["PYTHONHASHSEED"] = str(seed)
# See https://gist.github.com/BramVanroy/f78530673b1437ed0d6be7c61cdbdd7c
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, HyperOptArguments))
try:
    # Assumes that the first .json file is the config file (if any)
    config_file = next(iter(arg for arg in sys.argv if arg.endswith(".json")))
except StopIteration:
    config_file = None

run_name_specified = False
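# The snippet stops before `config_file` is consumed; a hedged continuation,
# following the standard HfArgumentParser pattern of parsing either the JSON
# config or the command line:
if config_file is not None:
    model_args, data_args, training_args, hyperopt_args = parser.parse_json_file(json_file=config_file)
else:
    model_args, data_args, training_args, hyperopt_args = parser.parse_args_into_dataclasses()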
# If we open a session/job on a host whose name starts with gpu* (e.g. gpu512.dodrio.os),
# load PyTorch with CUDA and pdsh.
# This makes sure that deepspeed/pdsh work in multi-node settings.
if [[ $(hostname) == gpu* ]]; then
    module load PyTorch/1.12.0-foss-2022a-CUDA-11.7.0;
    module load pdsh/2.34-GCCcore-11.3.0;
fi

# Automatically generates a hostfile for the current job in the current directory,
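The gist is cut off right after this comment. A sketch of what such a hostfile generator could look like, written here in Python for illustration; it assumes a Slurm environment and the DeepSpeed `host slots=N` hostfile format, and the GPUs-per-node fallback of 4 is an assumption:

import os
import subprocess

# Expand the compact Slurm node list (e.g. "gpu[510-512]") into hostnames
nodes = subprocess.check_output(
    ["scontrol", "show", "hostnames", os.environ["SLURM_JOB_NODELIST"]],
    text=True,
).split()
slots = os.environ.get("SLURM_GPUS_ON_NODE", "4")
with open("hostfile", "w") as fh:
    for node in nodes:
        fh.write(f"{node} slots={slots}\n")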
import math

import psutil
from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


def format_bytes(nbytes: int) -> str:
    if nbytes == 0:
        return "0 B"
    unit = ("B", "kB", "MB", "GB", "TB")
    # Pick the largest unit that keeps the value at or above 1
    idx = min(int(math.floor(math.log(nbytes, 1024))), len(unit) - 1)
    return f"{nbytes / 1024 ** idx:.2f} {unit[idx]}"
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

text = "It 's a pre-tokenized , silly sentence !"
words = text.split()
encoded = tokenizer(words, is_split_into_words=True)

for token, wordid in zip(encoded.tokens(), encoded.word_ids()):
    if wordid is not None:
        print(token, words[wordid])
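Special tokens like [CLS] and [SEP] get a word id of None, which is why they are filtered out here. This word-to-subword alignment is the same mechanism token classification pipelines use to propagate word-level labels onto subword tokens.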