Last active
June 29, 2023 14:11
-
-
Save Dref360/6b533ab664144cd4746f9f7e61b19fd4 to your computer and use it in GitHub Desktop.
Example of how to use Baal for NER use cases with HuggingFace.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from copy import deepcopy

from datasets import load_dataset
from transformers import pipeline, DataCollatorForTokenClassification
from baal.active.active_loop import ActiveLearningLoop
from baal.active.dataset import ActiveLearningDataset
from baal.active.heuristics import BALD
from baal.bayesian.dropout import patch_module
from baal.transformers_trainer_wrapper import BaalTransformersTrainer
# Load the CoNLL-2003 dataset and build a token-classification pipeline from
# the checkpoint named below.
dataset = load_dataset("conll2003")
# NOTE: this rebinds the imported `pipeline` factory to the pipeline instance.
# The name is kept because later code reads `pipeline.model`.
pipeline = pipeline(task="ner", model="issifuamajeed/distilbert-base-uncased-finetuned-ner")
tokenizer = pipeline.tokenizer
# Cap the sequence length; the tokenizer call further down uses
# truncation=True and padding='max_length', which rely on this limit.
tokenizer.model_max_length = 150
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Sequence of integer label ids, one per word.
        word_ids: For each token, the index of the word it belongs to, or
            ``None`` for a special token.

    Returns:
        A list with one label per entry of ``word_ids``. Special tokens get
        ``-100``. The first token of a word gets the word's label. Every
        following token of the same word gets the word's label, bumped by one
        when that label is odd (the B-XXX -> I-XXX convention used here).
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: it never carries a label and ends the current word.
            current_word = None
            new_labels.append(-100)
            continue

        label = labels[word_id]
        if word_id != current_word:
            # Start of a new word: keep its label untouched.
            current_word = word_id
        elif label % 2 == 1:
            # Continuation of the same word: a B-XXX label becomes I-XXX.
            label += 1
        new_labels.append(label)
    return new_labels
# Tokenize dataset
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (the words of each
            sentence) and a ``"ner_tags"`` entry (the label ids of each
            sentence's words).

    Returns:
        The tokenizer output for ``examples["tokens"]`` with an extra
        ``"labels"`` entry holding, per sentence, the labels produced by
        ``align_labels_with_tokens``.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
    )
    # word_ids(i) maps every token of sentence i back to its source word.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs
# Run the tokenization over the dataset in batches. The columns listed for the
# train split are removed from the result.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
# BAAL: Setup ALDataset and label 100 examples.
al_dataset = ActiveLearningDataset(dataset=tokenized_dataset['train'])
al_dataset.label_randomly(100)

# Apply MC-Dropout, create trainer and loop objects
model = patch_module(pipeline.model)
# state_dict() hands back tensors that share storage with the live parameters.
# Without a deep copy, training would mutate this snapshot and the reset to
# the initial weights at the start of every round would silently do nothing.
init_weights = deepcopy(model.state_dict())
trainer = BaalTransformersTrainer(
    model=model,
    train_dataset=al_dataset,
    eval_dataset=tokenized_dataset['validation'],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    tokenizer=tokenizer,
)
# NOTE(review): no `iterations` is forwarded to predict_on_dataset through the
# loop, unlike the explicit iterations=10 used for the standalone prediction
# further down -- confirm the default is what you want for BALD.
loop = ActiveLearningLoop(
    dataset=al_dataset,
    get_probabilities=trainer.predict_on_dataset,
    heuristic=BALD(reduction='sum'),
    query_size=100,
)
"""Prediction piece""" | |
# Shape [Batch_size, Num-Tokens, Probabilities, Iterations]
predictions = trainer.predict_on_dataset(tokenized_dataset['test'], iterations=10)
# Same predictions with the class axis first:
# [batch_size, Probabilities, Num Tokens, Iteration]
class_first_predictions = predictions.swapaxes(1, 2)
# Calling the heuristic gives what to label next (presumably a ranking of
# sample indices -- confirm); get_uncertainties gives the raw scores.
next_to_label = BALD(reduction='sum')(class_first_predictions)
uncertainties = BALD().get_uncertainties(class_first_predictions)
"""Training Piece""" | |
for _ in range(2):
    # Every round starts again from the initial weights.
    trainer.load_state_dict(init_weights)
    print(f"Active learning: labelled={al_dataset.n_labelled} unlabelled={al_dataset.n_unlabelled}")
    trainer.train()
    # Clear the scheduler left over from this round (presumably so the next
    # train() call builds a fresh one -- confirm against the Trainer in use).
    trainer.lr_scheduler = None
    trainer.evaluate()
    # step() queries the next examples to label and returns a flag telling
    # whether to continue; stop early once it reports there is nothing to do.
    if not loop.step():
        break
Are you sure you are using the branch feat/handle_ner
@ayushkm2799 ?
I updated map_on_tensor
to handle this issue specifically.
https://github.com/baal-org/baal/pull/263/files#diff-a5206cabfdb30f2ab85f9320276e3cbcbb0c86cfe5f4fcd03a94ca63721b6d91L4
You can also book a meeting on Calendly so that we can debug the issue together.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
hi @Dref360,
I already tried this, but I am still getting the same error; it also occurs when I run your code as-is.
Exception has occurred: AttributeError (note: full exception trace is shown but execution is paused at: )
'size'
KeyError: 'size'
During handling of the above exception, another exception occurred:
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/utils/array_utils.py", line 40, in stack_in_memory
input_shape = data.size()
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 74, in
lambda element: map_on_tensor(lambda d: stack_in_memory(d, iterations), element),
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/utils/iterutils.py", line 10, in map_on_tensor
return fn(val)
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 74, in
lambda element: map_on_tensor(lambda d: stack_in_memory(d, iterations), element),
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/utils/iterutils.py", line 10, in map_on_tensor
return fn(val)
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 73, in predict_on_dataset_generator
inputs = map_on_tensor(
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal/transformers_trainer_wrapper.py", line 111, in predict_on_dataset
preds = list(
File "/home/ubuntu/ayuskm/Active_Learning/baal/baal1.py", line 82, in (Current frame)
predictions = trainer.predict_on_dataset(tokenized_dataset['test'], iterations=10)
AttributeError: