Last active: August 19, 2024 13:27
Revisions
pommedeterresautee revised this gist
May 20, 2020. 1 changed file with 1 addition and 1 deletion.
The Makefile's train target now calls trainer.py instead of prepare.py:

@@ -9,7 +9,7 @@ SOURCE_VIRT_ENV = source $(VIRT_ENV_FOLDER)/bin/activate
 train:
 	( \
 	$(SOURCE_VIRT_ENV); \
-	python prepare.py \
+	python trainer.py \
 	--output_dir ./models/speed_camembert_max_len_128_fp16_dynamic_padding_smart_batching_batch_64_seed_321 \
 	--overwrite_output_dir \
 	--save_steps 0 \
pommedeterresautee revised this gist
May 19, 2020. 2 changed files with 0 additions and 103 deletions.
This revision removed two files in full: the 24-line collator snippet (@@ -1,24 +0,0 @@) and the 79-line data loading snippet (@@ -1,79 +0,0 @@).
pommedeterresautee created this gist
May 19, 2020.
Four files were added.

Makefile (31 added lines):

# required by (\ SHELL COMMANDS \)
SHELL:=/bin/bash

VIRT_ENV_FOLDER = ~/.local/share/virtualenvs/xnli
SOURCE_VIRT_ENV = source $(VIRT_ENV_FOLDER)/bin/activate

.PHONY: train

train:
	( \
	$(SOURCE_VIRT_ENV); \
	python prepare.py \
	--output_dir ./models/speed_camembert_max_len_128_fp16_dynamic_padding_smart_batching_batch_64_seed_321 \
	--overwrite_output_dir \
	--save_steps 0 \
	--seed 321 \
	--num_train_epochs 1 \
	--learning_rate 5e-5 \
	--per_gpu_train_batch_size 64 \
	--gradient_accumulation_steps 1 \
	--per_gpu_eval_batch_size 64 \
	--max_seq_len 128 \
	--dynamic_padding \
	--smart_batching \
	--fp16 \
	--evaluate_during_training ; \
	)

# --smart_batching
# --dynamic_padding
# --fp16

Collator snippet (24 added lines):

# ...

def pad_seq(seq: List[int], max_batch_len: int, pad_value: int) -> List[int]:
    return seq + (max_batch_len - len(seq)) * [pad_value]


@dataclass
class SmartCollator(DataCollator):
    pad_token_id: int

    def collate_batch(self, batch: List[Features]) -> Dict[str, torch.Tensor]:
        batch_inputs = list()
        batch_attention_masks = list()
        labels = list()
        max_size = max([len(ex.input_ids) for ex in batch])
        for item in batch:
            batch_inputs += [pad_seq(item.input_ids, max_size, self.pad_token_id)]
            batch_attention_masks += [pad_seq(item.attention_mask, max_size, 0)]
            labels.append(item.label)
        return {"input_ids": torch.tensor(batch_inputs, dtype=torch.long),
                "attention_mask": torch.tensor(batch_attention_masks, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)
                }
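For illustration only (not part of the gist), a minimal sketch of what SmartCollator produces for a toy batch, assuming Features, SmartCollator and torch are in scope as in the training script below; the token ids and the pad id are made up:

# Hypothetical toy batch: two sequences of length 3 and 5.
toy_batch = [Features(input_ids=[5, 22, 6], attention_mask=[1, 1, 1], label=0),
             Features(input_ids=[5, 13, 8, 9, 6], attention_mask=[1, 1, 1, 1, 1], label=2)]
collator = SmartCollator(pad_token_id=1)     # 1 is a placeholder pad id
tensors = collator.collate_batch(toy_batch)
print(tensors["input_ids"].shape)            # torch.Size([2, 5]): padded to the batch max, not to max_seq_len

The attention mask is padded with 0, so the extra positions are ignored by the model; this is the dynamic padding part of the speed-up.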
Data loading snippet (79 added lines):

def load_train_data(path: str, sort: bool) -> List[Example]:
    sentences = list()
    with open(path) as f:
        first = False
        for line in f:
            if not first:
                # skip the first line of the TSV file
                first = True
                continue
            text_a, text_b, label = line.rstrip().split("\t")
            lab = len(text_a) + len(text_b)  # character length, used as sort key
            sentences.append((lab, Example(text_a=text_a, text_b=text_b, label=label_codes[label])))
    if sort:
        sentences.sort(key=lambda x: x[0])
    return [e for (_, e) in sentences]


def load_dev_data(path: str) -> List[Example]:
    sentences = list()
    with open(path) as f:
        for raw_line in f:
            line = raw_line.split("\t")
            if line[0] != "fr":
                continue
            text_a = line[6]
            text_b = line[7]
            label = line[1]
            lab = len(text_a) + len(text_b)
            sentences.append((lab, Example(text_a=text_a, text_b=text_b, label=label_codes[label])))
    sentences.sort(key=lambda x: x[0])
    return [e for (_, e) in sentences]


def build_batches(sentences: List[Example], batch_size: int) -> List[Example]:
    batch_ordered_sentences = list()
    while len(sentences) > 0:
        to_take = min(batch_size, len(sentences))
        select = random.randint(0, len(sentences) - to_take)
        batch_ordered_sentences += sentences[select:select + to_take]
        del sentences[select:select + to_take]
    return batch_ordered_sentences


class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, pad_to_max_length: bool, max_len: int,
                 examples: List[Example]) -> None:
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.examples: List[Example] = examples
        self.current = 0
        self.pad_to_max_length = pad_to_max_length

    def encode(self, ex: Example) -> Features:
        encode_dict = self.tokenizer.encode_plus(text=ex.text_a,
                                                 text_pair=ex.text_b,
                                                 add_special_tokens=True,
                                                 max_length=self.max_len,
                                                 pad_to_max_length=self.pad_to_max_length,
                                                 return_token_type_ids=False,
                                                 return_attention_mask=True,
                                                 return_overflowing_tokens=False,
                                                 return_special_tokens_mask=False,
                                                 )
        return Features(input_ids=encode_dict["input_ids"],
                        attention_mask=encode_dict["attention_mask"],
                        label=ex.label)

    def __getitem__(self, idx) -> Features:
        # Trainer doesn't support IterableDataset (define sampler)
        if self.current == len(self.examples):
            self.current = 0
        ex = self.examples[self.current]
        self.current += 1
        return self.encode(ex=ex)

    def __len__(self):
        return len(self.examples)
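A hypothetical illustration (not from the gist) of why sorting examples by length before build_batches reduces padding: once the list is sorted, every random contiguous slice groups neighbouring, similar lengths. Plain integers stand in for the Example objects here:

# Hypothetical smart-batching demo on made-up lengths.
import random
from typing import List

random.seed(0)
lengths: List[int] = sorted([3, 12, 4, 9, 11, 5, 10, 4])  # stand-ins for length-sorted examples
batches = []
while lengths:
    to_take = min(3, len(lengths))                       # batch_size = 3
    select = random.randint(0, len(lengths) - to_take)   # random start of a contiguous slice
    batches.append(lengths[select:select + to_take])
    del lengths[select:select + to_take]
print(batches)  # each sub-list holds neighbouring lengths, so little padding is wasted

Picking the slice start at random keeps some shuffling across epochs while still grouping similar lengths, which is the trade-off behind the --smart_batching flag.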
Training script (256 added lines):

import logging
import os
import random
import time
from dataclasses import dataclass, field
from typing import Dict, Optional
from typing import List

import numpy as np
import torch
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, EvalPrediction, Trainer, HfArgumentParser, TrainingArguments, \
    AutoModelForSequenceClassification, set_seed, AutoConfig
from transformers import PreTrainedTokenizer, DataCollator, PreTrainedModel

import wandb

set_seed(123)

label_codes = {"contradictory": 0, "contradiction": 0, "neutral": 1, "entailment": 2}

wandb.init(project="speed_training")


class MyTrainer(Trainer):
    def _setup_wandb(self):
        # log run config and gradients to Weights & Biases
        wandb.init(project="speed_training", config=vars(self.args), name=self.args.output_dir)
        wandb.watch(self.model, log="gradients", log_freq=self.args.logging_steps)


@dataclass
class Example:
    text_a: str
    text_b: str
    label: int


@dataclass
class Features:
    input_ids: List[int]
    attention_mask: List[int]
    label: int


@dataclass
class ModelParameters:
    max_seq_len: Optional[int] = field(
        default=None,
        metadata={"help": "max seq len"},
    )
    dynamic_padding: bool = field(
        default=False,
        metadata={"help": "limit pad size at batch level"},
    )
    smart_batching: bool = field(
        default=False,
        metadata={"help": "build batch of similar sizes"},
    )
    dynamic_batch_size: bool = field(
        default=False,
        metadata={"help": "build batch of similar sizes"},
    )


class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, pad_to_max_length: bool, max_len: int,
                 examples: List[Example]) -> None:
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.examples: List[Example] = examples
        self.current = 0
        self.pad_to_max_length = pad_to_max_length

    def encode(self, ex: Example) -> Features:
        encode_dict = self.tokenizer.encode_plus(text=ex.text_a,
                                                 text_pair=ex.text_b,
                                                 add_special_tokens=True,
                                                 max_length=self.max_len,
                                                 pad_to_max_length=self.pad_to_max_length,
                                                 return_token_type_ids=False,
                                                 return_attention_mask=True,
                                                 return_overflowing_tokens=False,
                                                 return_special_tokens_mask=False,
                                                 )
        return Features(input_ids=encode_dict["input_ids"],
                        attention_mask=encode_dict["attention_mask"],
                        label=ex.label)

    def __getitem__(self, idx) -> Features:
        # Trainer doesn't support IterableDataset (define sampler)
        if self.current == len(self.examples):
            self.current = 0
        ex = self.examples[self.current]
        self.current += 1
        return self.encode(ex=ex)

    def __len__(self):
        return len(self.examples)


def pad_seq(seq: List[int], max_batch_len: int, pad_value: int) -> List[int]:
    return seq + (max_batch_len - len(seq)) * [pad_value]


@dataclass
class SmartCollator(DataCollator):
    pad_token_id: int

    def collate_batch(self, batch: List[Features]) -> Dict[str, torch.Tensor]:
        batch_inputs = list()
        batch_attention_masks = list()
        labels = list()
        # pad only up to the longest sequence of the current batch (dynamic padding)
        max_size = max([len(ex.input_ids) for ex in batch])
        for item in batch:
            batch_inputs += [pad_seq(item.input_ids, max_size, self.pad_token_id)]
            batch_attention_masks += [pad_seq(item.attention_mask, max_size, 0)]
            labels.append(item.label)
        return {"input_ids": torch.tensor(batch_inputs, dtype=torch.long),
                "attention_mask": torch.tensor(batch_attention_masks, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)
                }


def load_transformers_model(pretrained_model_name_or_path: str, use_cuda: bool,
                            mixed_precision: bool) -> PreTrainedModel:
    config = AutoConfig.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path, num_labels=3)
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path, config=config)
    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda')
        model.to(device)
    if mixed_precision:
        try:
            from apex import amp
            model = amp.initialize(model, opt_level='O1')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    return model


def load_train_data(path: str, sort: bool) -> List[Example]:
    sentences = list()
    with open(path) as f:
        first = False
        for line in f:
            if not first:
                first = True
                continue
            text_a, text_b, label = line.rstrip().split("\t")
            lab = len(text_a) + len(text_b)
            sentences.append((lab, Example(text_a=text_a, text_b=text_b, label=label_codes[label])))
    if sort:
        sentences.sort(key=lambda x: x[0])
    return [e for (_, e) in sentences]


def load_dev_data(path: str) -> List[Example]:
    sentences = list()
    with open(path) as f:
        for raw_line in f:
            line = raw_line.split("\t")
            if line[0] != "fr":
                continue
            text_a = line[6]
            text_b = line[7]
            label = line[1]
            lab = len(text_a) + len(text_b)
            sentences.append((lab, Example(text_a=text_a, text_b=text_b, label=label_codes[label])))
    sentences.sort(key=lambda x: x[0])
    return [e for (_, e) in sentences]


def build_batches(sentences: List[Example], batch_size: int) -> List[Example]:
    batch_ordered_sentences = list()
    while len(sentences) > 0:
        to_take = min(batch_size, len(sentences))
        # random contiguous slice; with length-sorted input each batch groups similar lengths (smart batching)
        select = random.randint(0, len(sentences) - to_take)
        batch_ordered_sentences += sentences[select:select + to_take]
        del sentences[select:select + to_take]
    return batch_ordered_sentences


if __name__ == "__main__":
    parser = HfArgumentParser((TrainingArguments, ModelParameters))
    training_args, model_args = parser.parse_args_into_dataclasses()  # type: (TrainingArguments, ModelParameters)

    train_sentences = load_train_data(path="resources/XNLI-MT-1.0/multinli/multinli.train.fr.tsv",
                                      sort=model_args.smart_batching)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="camembert-base")

    if model_args.max_seq_len:
        max_sequence_len = model_args.max_seq_len
    else:
        longest_sentence = max(train_sentences, key=len)
        max_sequence_len = len(tokenizer.encode(text=longest_sentence.text_a, text_pair=longest_sentence.text_b))

    train_batches = build_batches(sentences=train_sentences, batch_size=training_args.per_gpu_train_batch_size)
    valid_sentences = load_dev_data(path="resources/XNLI-1.0/xnli.test.tsv")
    valid_batches = build_batches(sentences=valid_sentences, batch_size=training_args.per_gpu_eval_batch_size)

    train_set = TextDataset(tokenizer=tokenizer, max_len=max_sequence_len, examples=train_batches,
                            pad_to_max_length=not model_args.dynamic_padding)
    valid_set = TextDataset(tokenizer=tokenizer, max_len=max_sequence_len, examples=valid_batches,
                            pad_to_max_length=not model_args.dynamic_padding)

    model = load_transformers_model(pretrained_model_name_or_path="camembert-base", use_cuda=True,
                                    mixed_precision=False)

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": (preds == p.label_ids).mean()}

    trainer = MyTrainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        # data_collator=IdentityCollator(pad_token_id=tokenizer.pad_token_id),
        data_collator=SmartCollator(pad_token_id=tokenizer.pad_token_id),
        tb_writer=SummaryWriter(log_dir='logs', flush_secs=10),
        eval_dataset=valid_set,
        compute_metrics=compute_metrics,
    )

    start_time = time.time()
    trainer.train()
    wandb.config.update(model_args)
    wandb.config.update(training_args)
    wandb.log({"training time": int((time.time() - start_time) / 60)})
    trainer.save_model()
    trainer.evaluate()
    logging.info("*** Evaluate ***")

    result = trainer.evaluate()
    wandb.log(result)
    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logging.info("***** Eval results *****")
        for key, value in result.items():
            logging.info("  %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))
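As a rough sketch (not in the gist), this is how the Makefile flags map onto the two dataclasses parsed by HfArgumentParser. It assumes ModelParameters from the script above is in scope, that parse_args_into_dataclasses accepts an explicit args list, and it uses a hypothetical output directory:

from transformers import HfArgumentParser, TrainingArguments

# ModelParameters is the dataclass defined in the training script above.
parser = HfArgumentParser((TrainingArguments, ModelParameters))
training_args, model_args = parser.parse_args_into_dataclasses(args=[
    "--output_dir", "./models/demo",          # hypothetical output dir
    "--per_gpu_train_batch_size", "64",
    "--max_seq_len", "128",
    "--dynamic_padding",
    "--smart_batching",
])
print(model_args.dynamic_padding, model_args.smart_batching, model_args.max_seq_len)  # True True 128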