- Update HISTORY.rst
- Commit the changes:

    git add HISTORY.rst
    git commit -m "Changelog for upcoming release 0.1.1."

- Update the version number (can also be major (x.0.0) or minor (0.x.0) instead of patch (0.0.x)):

    bumpversion patch
Oftentimes you may want to use a web-based tool while programming, e.g. a Jupyter notebook, Tensorboard, or Streamlit. It is easy to set these tools up locally, on your own machine, but this computer may not be as powerful as a server that you have available. Here is a small guide that shows you how to easily use such a web-based tool remotely. As an example, we will use Tensorboard, allowing us to remotely monitor the live-updated progress of our machine learning system during training. This gist is simply an extension of the following Stack Overflow post. This gist does not cover how to use Tensorboard itself. To get started with that, read through the documentation (works for TensorFlow as well as PyTorch).
If we started Tensorboard on our own machine, it would create a local server that is accessible through a specific port in the browser (by default http://localhost:6006).
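A minimal sketch of the port forwarding, assuming the server is reachable over SSH as user@server (a placeholder) and that Tensorboard runs there on its default port 6006; afterwards, open http://localhost:6006 in your local browser:

import subprocess

# Placeholder host; replace with your own login, e.g. jane@gpu.example.com
remote_host = "user@server"

# Equivalent shell command: ssh -N -L 6006:localhost:6006 user@server
# -L forwards local port 6006 to port 6006 on the server; -N opens no remote shell.
# This call blocks while the tunnel is open; stop it with Ctrl+C.
subprocess.run(["ssh", "-N", "-L", "6006:localhost:6006", remote_host])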
from typing import List, Optional

import spacy
from spacy import Language
from spacy.tokens import Doc


def load_nlp(model_name: str = "en_core_web_sm",
             is_tokenized: bool = False,
             exclude: Optional[List[str]] = None) -> Language:
    """Load a spaCy model. If is_tokenized, assume whitespace-pretokenized input
    and disable spaCy's own tokenization."""
    nlp = spacy.load(model_name, exclude=exclude if exclude is not None else [])
    if is_tokenized:
        # Treat input as pre-tokenized: build the Doc straight from the whitespace-split words
        nlp.tokenizer = lambda text: Doc(nlp.vocab, words=text.split())
    return nlp
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
text = "It 's a pre-tokenized , silly sentence !"
words = text.split()
# is_split_into_words tells the tokenizer that the input is already a list of words
encoded = tokenizer(words, is_split_into_words=True)
# word_ids() maps every subword token back to the index of the original word
for token, wordid in zip(encoded.tokens(), encoded.word_ids()):
    if wordid is not None:  # None for special tokens such as [CLS] and [SEP]
        print(token, words[wordid])
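This word-to-token mapping is what you need to propagate word-level labels (e.g. NER tags) to subword tokens. A sketch with hypothetical integer labels, one per word; -100 is the index that PyTorch's cross-entropy loss ignores by default:

word_label_ids = [0, 0, 0, 1, 0, 0, 0, 0]  # hypothetical: one label id per word in `words`
token_label_ids = [-100 if i is None else word_label_ids[i] for i in encoded.word_ids()]
print(token_label_ids)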
import math

import psutil
from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


def format_bytes(nbytes: int) -> str:
    """Format a number of bytes as a human-readable string."""
    if nbytes == 0:
        return "0 B"
    units = ("B", "kB", "MB", "GB", "TB")
    exp = min(int(math.log(nbytes, 1024)), len(units) - 1)
    return f"{nbytes / 1024 ** exp:.2f} {units[exp]}"
# If we open a session/job on a host whose name starts with gpu* (e.g. gpu512.dodrio.os),
# load PyTorch with CUDA as well as pdsh.
# This makes sure that deepspeed/pdsh work in multi-node settings.
if [[ $(hostname) == gpu* ]]; then
    module load PyTorch/1.12.0-foss-2022a-CUDA-11.7.0;
    module load pdsh/2.34-GCCcore-11.3.0;
fi

# Automatically generates a hostfile for the current job in the current directory,
# e.g. for use with deepspeed in multi-node jobs.
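The generating function itself is not shown here; a rough Python sketch, assuming a SLURM cluster where scontrol is available and SLURM_JOB_NODELIST and SLURM_GPUS_ON_NODE are set (deepspeed expects hostfile lines of the form "<host> slots=<n>"):

import os
import subprocess

# Expand the compact SLURM nodelist (e.g. "gpu[512-513]") into one hostname per line
nodes = subprocess.check_output(
    ["scontrol", "show", "hostnames", os.environ["SLURM_JOB_NODELIST"]],
    text=True,
).split()
slots = os.environ.get("SLURM_GPUS_ON_NODE", "1")
with open("hostfile", "w") as fhout:
    for node in nodes:
        fhout.write(f"{node} slots={slots}\n")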
import sys

from transformers import HfArgumentParser

# See https://gist.github.com/BramVanroy/f78530673b1437ed0d6be7c61cdbdd7c
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, HyperOptArguments))
try:
    # Assumes that the first .json file on the command line is the config file (if any)
    config_file = next(arg for arg in sys.argv if arg.endswith(".json"))
except StopIteration:
    config_file = None

run_name_specified = False
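A sketch of how the detected config file is then typically consumed; parse_json_file and parse_args_into_dataclasses are existing HfArgumentParser methods, and the four dataclasses come from the surrounding script:

if config_file is not None:
    model_args, data_args, training_args, hyperopt_args = parser.parse_json_file(json_file=config_file)
else:
    model_args, data_args, training_args, hyperopt_args = parser.parse_args_into_dataclasses()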
import os
import random
from typing import Optional

import numpy as np
import torch


def set_seed(seed: Optional[int]):
    """Seed all relevant random number generators for reproducibility."""
    if seed is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade speed for reproducibility in cuDNN
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        random.seed(seed)
        os.environ["PYTHONHASHSEED"] = str(seed)
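Two caveats: setting torch.backends.cudnn.deterministic to True (and benchmark to False) can slow training down noticeably, and assigning PYTHONHASHSEED from inside a running interpreter only affects subprocesses, since Python reads that variable at startup.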
# If there is an error in nvidia-smi, log it to a file in ~/gpu-errors!
nvidia_smi_output=$(nvidia-smi)
if echo "$nvidia_smi_output" | grep -q "ERR"; then
    fname=~/gpu-errors/$(hostname)-error.txt
    pdir=$(dirname "$fname")
    mkdir -p "$pdir"
    nvcc_output=$(nvcc --version)
    echo "$nvidia_smi_output"$'\n'"$nvcc_output" > "$fname"
fi
import importlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from transformers import HfArgumentParser, AutoConfig, AutoTokenizer


@dataclass
class ScriptArguments:
    # Illustrative placeholder fields
    model_name_or_path: str = field(metadata={"help": "Model checkpoint to load."})
    output_dir: Optional[Path] = field(default=None, metadata={"help": "Where to write output."})
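A brief usage sketch for the snippet above, assuming the illustrative fields:

parser = HfArgumentParser(ScriptArguments)
script_args, = parser.parse_args_into_dataclasses()
config = AutoConfig.from_pretrained(script_args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path)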