Last active
February 14, 2021 14:48
-
-
Save sharvaridhote/60f6005a6c9fa171fd7c8cd05d6e117e to your computer and use it in GitHub Desktop.
spaCy text-classifier training function (trains a "citation needed" classifier and optionally saves the model)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def training(train_texts, train_cats, dev_texts, dev_cats, test_texts, test_cats,
             L2, learn_rate, n_iter, output_dir=None):
    """
    Spacy example function modified.
    Trains a citation-needed text classifier and optionally saves the model.

    Parameters:
        train_texts : list of str - training text features
        train_cats : list of dict - training labels (citation sentence -> TRUE else FALSE)
        dev_texts : list of str - validation text features
        dev_cats : list of dict - validation labels (citation sentence -> TRUE else FALSE)
        test_texts : list of str - test text features
        test_cats : list of dict - test labels (citation sentence -> TRUE else FALSE)
        L2 : float - L2 regularization parameter - default value 1e-6
        learn_rate : float - learning rate - default rate 0.001
        n_iter : int - number of training epochs
        output_dir : str = None - path to save the model; created if missing

    Returns:
        (train_results, dev_results, test_results) - one dict of evaluated
        metrics (accuracy, precision, recall, f1) per epoch for each split.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so nested output paths work too
            output_dir.mkdir(parents=True)

    nlp = spacy.load('en_core_web_sm')
    # Adding the built-in textcat component to the pipeline.
    textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
    nlp.add_pipe(textcat, last=True)
    # Adding the labels to textcat
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # Disable every other pipeline component so only textcat is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        optimizer.L2 = L2
        optimizer.learn_rate = learn_rate
        # Decaying dropout: starts at 0.6, decays toward 0.2 by 1e-4 per step.
        dec = decaying(0.6, 0.2, 1e-4)

        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'A_train', 'A_dev', 'A_test', 'P', 'R', 'F'))

        train_results = []
        dev_results = []
        test_results = []

        # Pair texts with their annotations once; only the shuffle needs to
        # happen per epoch (the pairing itself is loop-invariant).
        train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))

        # Performing training
        for i in range(n_iter):
            losses = {}
            random.shuffle(train_data)
            # Increasing batch sizes from 4 up to 32.
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=next(dec),
                           losses=losses)

            # Evaluate on the training split with the current weights.
            scores1 = evaluate(nlp.tokenizer, textcat, train_texts, train_cats)
            train_results.append(scores1)

            # Evaluate dev/test with the averaged parameters for a more
            # stable estimate of generalization.
            with textcat.model.use_params(optimizer.averages):
                scores2 = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
                scores3 = evaluate(nlp.tokenizer, textcat, test_texts, test_cats)
            dev_results.append(scores2)
            test_results.append(scores3)

            print('{0:.4f}\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}\t{5:.4f}\t{6:.4f}'
                  .format(losses['textcat'], scores1['textcat_a'], scores2['textcat_a'], scores3['textcat_a'],
                          scores1['textcat_p'],
                          scores1['textcat_r'], scores1['textcat_f']))

    if output_dir is not None:
        # Save the averaged weights, which generally evaluate better than
        # the final raw weights.
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return train_results, dev_results, test_results
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment