This Jupyter notebook demonstrates fine-tuning a RoBERTa model (a Robustly Optimized BERT Pretraining Approach) for extractive question answering on SQuAD v2, using PyTorch and the 🤗 libraries (Transformers, Datasets, Evaluate, and Accelerate).
First, install and import the necessary libraries:
!pip install -qU transformers datasets evaluate accelerate
!pip install -qU torch torchvision torchaudio
!pip install -qU huggingface-hub
from huggingface_hub import notebook_login, HfApi, HfFolder
from datasets import load_dataset
import torch
import ipywidgets as widgets
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import (
AutoTokenizer,
DefaultDataCollator,
AutoModelForQuestionAnswering,
TrainingArguments,
Trainer,
default_data_collator,
get_scheduler,
)
from accelerate import Accelerator
from tqdm.auto import tqdm
import numpy as np
import collections
import evaluate
Log in with your Hugging Face API token via notebook_login:
notebook_login()
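notebook_login() pops up the login widget inside Jupyter. Outside of a notebook you can authenticate programmatically instead; a hedged alternative (the token string below is a placeholder for your own access token):
from huggingface_hub import login

login(token="hf_...")  # paste your own token, or run `huggingface-cli login` in a terminal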
First, we'll download the SQuAD v2 dataset from the Hugging Face Hub.
squad = load_dataset('squad_v2',use_auth_token=True)
Reusing dataset squad_v2 (/root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
model_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,use_auth_token=True)
We'll preprocess our training and validation sets using a custom function, preprocess_examples, which tokenizes each question/context pair, splits long contexts into overlapping chunks, and produces the features our question-answering model expects.
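preprocess_examples is imported from a local lib.utils module that isn't shown in this notebook. For orientation, below is a minimal sketch of what such a function typically looks like for SQuAD-style extractive QA, following the standard 🤗 recipe; the parameter names and the max_length=384 / stride=128 defaults are assumptions, and the real lib.utils.preprocess_examples may differ in its details.
def preprocess_examples(examples, tokenizer, is_test=False, max_length=384, stride=128):
    # Tokenize question/context pairs, splitting long contexts into overlapping chunks
    inputs = tokenizer(
        [q.strip() for q in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_map = inputs.pop("overflow_to_sample_mapping")

    if is_test:
        # Keep example_id, and null out offsets of non-context tokens so that
        # predictions can be mapped back to character spans at evaluation time
        inputs["example_id"] = [examples["id"][i] for i in sample_map]
        for i in range(len(inputs["input_ids"])):
            sequence_ids = inputs.sequence_ids(i)
            inputs["offset_mapping"][i] = [
                o if sequence_ids[k] == 1 else None
                for k, o in enumerate(inputs["offset_mapping"][i])
            ]
        return inputs

    # For training, convert character-level answer spans to token positions;
    # unanswerable questions (and answers falling outside a chunk) get (0, 0)
    offset_mapping = inputs.pop("offset_mapping")
    start_positions, end_positions = [], []
    for i, offsets in enumerate(offset_mapping):
        answers = examples["answers"][sample_map[i]]
        if len(answers["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        ctx_start = sequence_ids.index(1)
        ctx_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
        if offsets[ctx_start][0] > start_char or offsets[ctx_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = ctx_start
            while idx <= ctx_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = ctx_end
            while idx >= ctx_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs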
from lib.utils import preprocess_examples
train_dataset = squad['train'].map(
preprocess_examples,
batched=True,
remove_columns=squad['train'].column_names,
fn_kwargs = {
'tokenizer':tokenizer,
}
)
Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-9cf05c80fcf103ec.arrow
For the validation set, we pass is_test=True so that preprocess_examples also keeps the example_id and offset_mapping columns, which we'll need later to map predictions back to the original examples during evaluation.
validation_dataset = squad['validation'].map(
preprocess_examples,
batched=True,
remove_columns=squad['validation'].column_names,
fn_kwargs = {
'tokenizer':tokenizer,
'is_test':True,
}
)
Next we prepare our dataloaders. This training is best run on a CUDA device, i.e. a GPU. If you run into 'CUDA out of memory' errors, try lowering the batch_size (or see the gradient-accumulation sketch after the dataloader code).
train_dataset.set_format("torch")
eval_dataset = validation_dataset.remove_columns(["example_id", "offset_mapping"])
eval_dataset.set_format("torch")
train_dataloader = DataLoader(
train_dataset,
shuffle=True,
collate_fn=default_data_collator,
batch_size=16
)
eval_dataloader = DataLoader(
eval_dataset,
collate_fn=default_data_collator,
batch_size=16
)
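If lowering the batch size alone isn't enough, gradient accumulation trades memory for time while keeping the effective batch size the same. This is not what this notebook does; the following is a hedged sketch using Accelerate's built-in support (the gradient_accumulation_steps argument and accelerator.accumulate require a reasonably recent accelerate version):
# Purely illustrative: keep an effective batch size of 16 while only holding
# 4 examples in GPU memory at a time (4 examples x 4 accumulation steps).
small_train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=4,
)
accum_accelerator = Accelerator(mixed_precision="fp16", gradient_accumulation_steps=4)
# In the training loop you would then wrap each step so gradients are only
# applied every 4 batches:
#     with accum_accelerator.accumulate(model):
#         outputs = model(**batch)
#         accum_accelerator.backward(outputs.loss)
#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()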
Next, we set up the model, optimizer, Accelerator, and learning-rate scheduler for training:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
optimizer = AdamW(model.parameters(),lr = 3e-5)
accelerator = Accelerator(mixed_precision="fp16")
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
num_train_epochs=3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
'linear',
optimizer=optimizer,
num_warmup_steps = 0,
num_training_steps = num_training_steps,
)
output_dir = 'roberta-finetuned-squad-v2-accelerate'
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
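For orientation, num_update_steps_per_epoch is just the number of batches per epoch. In this run training took 24,717 optimizer steps over 3 epochs, i.e. about 8,239 batches per epoch; at batch_size=16 that is roughly 131,800 tokenized training features - more than the 130,319 raw SQuAD v2 training examples, because long contexts are split into overlapping chunks during preprocessing.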
Finally, we'll execute our training loop. Notice that we don't need to explicitly move the model or batches to our CUDA device - Accelerate handles this automatically.
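For contrast, here is roughly what the equivalent device handling looks like in plain PyTorch without Accelerate (purely illustrative - accelerator.prepare above already takes care of all of this):
# Manual device placement that Accelerate makes unnecessary (illustrative only):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}  # move each tensor to the GPU
    outputs = model(**batch)
    break  # one batch is enough to show the pattern; the real loop is below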
from lib.utils import compute_metrics
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training step
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Eval step
    model.eval()
    start_logits = []
    end_logits = []
    accelerator.print('Evaluation!')
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())
    # Concatenate logit arrays from batches and trim any padding added by gather
    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]
    # Compute and report metrics
    metrics = compute_metrics(
        start_logits, end_logits, validation_dataset, squad['validation']
    )
    print(f"epoch {epoch}:", metrics)
    # Save the model and tokenizer after each epoch
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
Evaluation!
epoch 0: {'exact': 77.7478312136781, 'f1': 80.8702752323304, 'total': 11873, 'HasAns_exact': 78.91363022941971, 'HasAns_f1': 85.16747264397094, 'HasAns_total': 5928, 'NoAns_exact': 76.58536585365853, 'NoAns_f1': 76.58536585365853, 'NoAns_total': 5945, 'best_exact': 77.7478312136781, 'best_exact_thresh': 0.0, 'best_f1': 80.87027523233047, 'best_f1_thresh': 0.0}
Evaluation!
epoch 1: {'exact': 79.81133664617198, 'f1': 82.94151794597373, 'total': 11873, 'HasAns_exact': 78.2051282051282, 'HasAns_f1': 84.4744673705377, 'HasAns_total': 5928, 'NoAns_exact': 81.41295206055509, 'NoAns_f1': 81.41295206055509, 'NoAns_total': 5945, 'best_exact': 79.81133664617198, 'best_exact_thresh': 0.0, 'best_f1': 82.94151794597376, 'best_f1_thresh': 0.0}
Evaluation!
epoch 2: {'exact': 80.45986692495578, 'f1': 83.52543495807724, 'total': 11873, 'HasAns_exact': 78.69433198380567, 'HasAns_f1': 84.83425932139885, 'HasAns_total': 5928, 'NoAns_exact': 82.22035323801514, 'NoAns_f1': 82.22035323801514, 'NoAns_total': 5945, 'best_exact': 80.45986692495578, 'best_exact_thresh': 0.0, 'best_f1': 83.52543495807726, 'best_f1_thresh': 0.0}
tokenizer.save_pretrained(output_dir)
('roberta-finetuned-squad-v2-accelerate-run2/tokenizer_config.json', 'roberta-finetuned-squad-v2-accelerate-run2/special_tokens_map.json', 'roberta-finetuned-squad-v2-accelerate-run2/vocab.json', 'roberta-finetuned-squad-v2-accelerate-run2/merges.txt', 'roberta-finetuned-squad-v2-accelerate-run2/added_tokens.json', 'roberta-finetuned-squad-v2-accelerate-run2/tokenizer.json')
{'exact': 80.45986692495578,
'f1': 83.52543495807724,
'total': 11873,
'HasAns_exact': 78.69433198380567,
'HasAns_f1': 84.83425932139885,
'HasAns_total': 5928,
'NoAns_exact': 82.22035323801514,
'NoAns_f1': 82.22035323801514,
'NoAns_total': 5945}
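These numbers come out of compute_metrics, which (like preprocess_examples) lives in the unshown lib.utils module. For orientation, here is a condensed sketch of what a SQuAD v2 metric function typically does - picking the best answer span per example from the start/end logits and scoring with the squad_v2 metric from 🤗 Evaluate. The n_best and max_answer_length defaults are assumptions, and the real implementation may handle the null-answer score more carefully.
import collections
import numpy as np
import evaluate

squad_metric = evaluate.load("squad_v2")

def compute_metrics(start_logits, end_logits, features, examples, n_best=20, max_answer_length=30):
    # Group tokenized features (chunks) by the example they came from
    features_per_example = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(idx)

    predictions = []
    for example in examples:
        context = example["context"]
        # Start from the null answer; any positive-scoring span beats it (simplification)
        best_answer = {"text": "", "score": 0.0}
        for feature_index in features_per_example[example["id"]]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]
            # Consider the n_best start/end positions and keep the best valid span
            for s in np.argsort(start_logit)[-n_best:][::-1]:
                for e in np.argsort(end_logit)[-n_best:][::-1]:
                    if offsets[s] is None or offsets[e] is None:
                        continue
                    if e < s or e - s + 1 > max_answer_length:
                        continue
                    score = start_logit[s] + end_logit[e]
                    if score > best_answer["score"]:
                        best_answer = {"text": context[offsets[s][0]:offsets[e][1]], "score": score}
        predictions.append({
            "id": example["id"],
            "prediction_text": best_answer["text"],
            "no_answer_probability": 0.0,  # '' predictions already encode "no answer"
        })

    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return squad_metric.compute(predictions=predictions, references=references)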
Inference using our saved model doesn't require much code - but don't forget to set the handle_impossible_answer option so that the pipeline handles unanswerable questions correctly: it will output an empty string '' for such questions.
repo_id = 'etweedy/roberta-base-squad-v2'
model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
repo_id = "etweedy/roberta-base-squad-v2"
QA_pipeline = pipeline('question-answering', model=repo_id, tokenizer=repo_id, handle_impossible_answer=True)
qa_input = {
    'question': 'Who invented Twinkies?',
    'context': 'Twinkies were invented on April 6, 1930, by Canadian-born baker James Alexander Dewar for the Continental Baking Company in Schiller Park, Illinois.'
}
response = QA_pipeline(**qa_input)
response
{'score': 0.9599111080169678, 'start': 64, 'end': 85, 'answer': 'James Alexander Dewar'}
qa_input = {
    'question': 'When was James Alexander Dewar born?',
    'context': 'Twinkies were invented on April 6, 1930, by Canadian-born baker James Alexander Dewar for the Continental Baking Company in Schiller Park, Illinois.'
}
response = QA_pipeline(**qa_input)
response
{'score': 0.9994915127754211, 'start': 0, 'end': 0, 'answer': ''}
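Under the hood, the pipeline tokenizes the question/context pair, runs the model, and decodes the highest-scoring start/end span. A minimal sketch of doing that by hand with the model and tokenizer loaded above (this ignores chunking of long contexts and the null-answer threshold, both of which the pipeline also handles):
question = "Who invented Twinkies?"
context = (
    "Twinkies were invented on April 6, 1930, by Canadian-born baker James Alexander "
    "Dewar for the Continental Baking Company in Schiller Park, Illinois."
)
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Take the most likely start and end token positions and decode that span;
# a span collapsing to the <s> token (index 0) signals "no answer"
start_idx = int(outputs.start_logits.argmax())
end_idx = int(outputs.end_logits.argmax())
answer = tokenizer.decode(inputs["input_ids"][0][start_idx : end_idx + 1], skip_special_tokens=True)
print(answer.strip())  # expected: 'James Alexander Dewar'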