-
-
Save g-i-o-r-g-i-o/d9d10e7ddbad1cb6f52b79e00396d851 to your computer and use it in GitHub Desktop.
Train GPT-2 from Scratch on Your Own Language (Persian) | GPT-2 Training on Non-English Text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# simpletransformers provides a high-level wrapper around HuggingFace
# Transformers for language-model training.
from simpletransformers.language_modeling import LanguageModelingModel

import logging

# Show overall training progress at INFO level, but silence the very
# chatty HuggingFace "transformers" logger down to warnings only.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
# Hyper-parameters for training a GPT-2 language model from scratch
# with simpletransformers' LanguageModelingModel.
# NOTE(review): block_size/max_seq_length of 509 presumably leaves room
# for special tokens within GPT-2's 512-token context — confirm intent.
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 5,
    "save_eval_checkpoints": True,
    "block_size": 509,
    "max_seq_length": 509,
    # "save_model_every_epoch": False,
    "learning_rate": 1e-4,
    "train_batch_size": 4,
    # Effective batch size = train_batch_size * gradient_accumulation_steps = 16.
    "gradient_accumulation_steps": 4,
    # GPT-2 is a causal (autoregressive) LM, not a masked LM.
    "mlm": False,
    "dataset_type": "simple",
    "logging_steps": 100,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 10000,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    "sliding_window": True,
    "use_multiprocessing": True,
    # Vocabulary size for the tokenizer trained from scratch.
    "vocab_size": 100000,
    # Plain string literals: the original used f-strings with no
    # placeholders (ruff F541) — the values are unchanged.
    "output_dir": "outputs/from_scratch",
    "best_model_dir": "outputs/from_scratch/best_model",
}
# Training corpus: Persian Wikipedia text dump.
# Dataset: https://github.com/miladfa7/Persian-Wikipedia-Dataset
# (original comment carried a link-shortener-mangled form of this URL)
# Plain strings: the originals were f-strings with no placeholders (F541).
train_file = "Persian-WikiText-all.txt"
test_file = "test.txt"
# Build the language model. A second argument of None means "train from
# scratch"; a path to a pretrained checkpoint there would fine-tune instead.
# train_files supplies the corpus used to fit the new tokenizer vocabulary.
model = LanguageModelingModel(
    "gpt2",
    None,  # None => from scratch; "/path/to/pretrain_model/" => fine-tune
    args=train_args,
    train_files=train_file,
)
# model.train_tokenizer(train_file)  # not needed: tokenizer is trained via train_files above
# Train on the Persian corpus, evaluating periodically on the test file.
model.train_model(train_file, eval_file=test_file)

# Final evaluation pass on the held-out test file.
model.eval_model(test_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment