"""Active-learning NER script: CoNLL-2003 with a Hugging Face pipeline and BAAL.

This first section loads the dataset and a pretrained NER pipeline, then
tokenizes every split and aligns the word-level tags with the sub-word tokens.
"""
from datasets import load_dataset
from transformers import pipeline, DataCollatorForTokenClassification

from baal.active.active_loop import ActiveLearningLoop
from baal.active.dataset import ActiveLearningDataset
from baal.active.heuristics import BALD
from baal.bayesian.dropout import patch_module
from baal.transformers_trainer_wrapper import BaalTransformersTrainer

dataset = load_dataset("conll2003")
# NOTE: this rebinds ``pipeline`` from the imported factory function to the
# pipeline object it creates; later code reads ``pipeline.model``.
pipeline = pipeline('ner', model='issifuamajeed/distilbert-base-uncased-finetuned-ner')
tokenizer = pipeline.tokenizer
tokenizer.model_max_length = 150


def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Label ids indexed by word position.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for a special token.

    Returns:
        A list with one entry per item of ``word_ids``: ``-100`` for special
        tokens, the word's label for the first token of a word, and for any
        following token of the same word the label with odd ids bumped by one
        (B-XXX becomes I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        continues_word = word_id == previous_word
        previous_word = word_id

        if word_id is None:
            # Special token. -100 is presumably the ignore index of the
            # trainer's loss - TODO confirm.
            aligned.append(-100)
            continue

        tag = labels[word_id]
        # Only a repeated word id marks a continuation token; there a
        # B-XXX label (odd id) is changed to the matching I-XXX.
        if continues_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned


# Tokenize dataset
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    Args:
        examples: Batch mapping that provides ``"tokens"`` (words per
            sentence) and ``"ner_tags"`` (label ids per sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
    )
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(row))
        for row, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded


tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# BAAL: Setup ALDataset and label 100 examples.
al_dataset = ActiveLearningDataset(dataset=tokenized_dataset['train'])
al_dataset.label_randomly(100)

# Number of stochastic (MC-Dropout) forward passes used for every uncertainty
# estimate, both in the stand-alone prediction demo and inside the loop.
MC_ITERATIONS = 10

# Apply MC-Dropout, create trainer and loop objects
model = patch_module(pipeline.model)

# ``state_dict()`` returns references to the live parameter tensors, so a
# plain assignment would be updated in place by training and reloading it
# would not reset anything. Deep-copy to keep a real snapshot.
from copy import deepcopy  # noqa: E402 - kept next to its only use

init_weights = deepcopy(model.state_dict())
trainer = BaalTransformersTrainer(
    model=model,
    train_dataset=al_dataset,
    eval_dataset=tokenized_dataset['validation'],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    tokenizer=tokenizer,
)


def predict_class_first(pool, **kwargs):
    """Run MC-Dropout inference and return predictions with the class axis first.

    ``trainer.predict_on_dataset`` yields
    [batch_size, num_tokens, classes, iterations]; the heuristic is fed
    [batch_size, classes, num_tokens, iterations], so axes 1 and 2 are swapped.

    Args:
        pool: Dataset to run inference on.
        **kwargs: Forwarded to ``trainer.predict_on_dataset``. ``iterations``
            defaults to ``MC_ITERATIONS`` when not supplied.

    Returns:
        The prediction array with axes 1 and 2 swapped.
    """
    kwargs.setdefault("iterations", MC_ITERATIONS)
    return trainer.predict_on_dataset(pool, **kwargs).swapaxes(1, 2)


# The loop uses the same class-first, multi-iteration predictions as the
# prediction demo below, so both rank samples on the same basis.
loop = ActiveLearningLoop(
    dataset=al_dataset,
    get_probabilities=predict_class_first,
    heuristic=BALD(reduction='sum'),
    query_size=100,
)

# --- Prediction piece ---
# Shape [Batch_size, Num-Tokens, Probabilities, Iterations]
predictions = trainer.predict_on_dataset(tokenized_dataset['test'], iterations=MC_ITERATIONS)
# Predictions with Class first [batch_size, Probabilities, Num Tokens, Iteration]
class_first_predictions = predictions.swapaxes(1, 2)
next_to_label = BALD(reduction='sum')(class_first_predictions)
uncertainties = BALD().get_uncertainties(class_first_predictions)

# --- Training piece ---
for _ in range(2):
    # Restart every round from the snapshot taken before any training.
    trainer.load_state_dict(init_weights)
    print(f"Active learning: labelled={al_dataset.n_labelled} unlabelled={al_dataset.n_unlabelled}")
    trainer.train()
    # Drop the used scheduler, presumably so the next ``train()`` call
    # builds a fresh one - TODO confirm.
    trainer.lr_scheduler = None
    trainer.evaluate()
    # ``step()`` labels the next ``query_size`` samples; a falsy result is
    # documented as the signal to stop training.
    if not loop.step():
        break