max_length = 384  # maximum length (in tokens) of a question + context feature
doc_stride = 128  # overlap between consecutive windows when a long context is split
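# Optional sanity check (not part of the pipeline): tokenize a single example to
# see how the sliding window splits a long context into overlapping features.
# Assumes `tokenizer` (a fast tokenizer) and `raw_dataset` from the earlier setup
# are in scope; `_sample` and `_enc` are throwaway names used only here.
_sample = raw_dataset["train"][0]
_enc = tokenizer(
    _sample["question"],
    _sample["context"],
    max_length=max_length,
    truncation="only_second",
    stride=doc_stride,
    return_overflowing_tokens=True,
)
print(f"Windows produced for one example: {len(_enc['input_ids'])}")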
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]

    # Tokenize questions and contexts; long contexts are split into several
    # overlapping features ("only_second" truncates the context, never the question)
    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
        stride=doc_stride,
        return_overflowing_tokens=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last tokens that belong to the context
        context_start_token_idx = 0
        while sequence_ids[context_start_token_idx] != 1:
            context_start_token_idx += 1
        context_end_token_idx = len(sequence_ids) - 1
        while sequence_ids[context_end_token_idx] != 1:
            context_end_token_idx -= 1

        # If the answer is not fully inside this window, label it (0, 0)
        if (offsets[context_start_token_idx][0] > start_char
                or offsets[context_end_token_idx][1] < end_char):
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Otherwise move inward until the tokens bracket the answer characters
        token_start_position = context_start_token_idx
        while (token_start_position <= context_end_token_idx
               and offsets[token_start_position][0] <= start_char):
            token_start_position += 1
        start_positions.append(token_start_position - 1)

        token_end_position = context_end_token_idx
        while (token_end_position >= context_start_token_idx
               and offsets[token_end_position][1] >= end_char):
            token_end_position -= 1
        end_positions.append(token_end_position + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
train_dataset = raw_dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_dataset["train"].column_names,
)
print(f"Original train examples: {len(raw_dataset['train'])}, Processed train examples: {len(train_dataset)}")
print(train_dataset[0])

def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
        stride=doc_stride,
        return_overflowing_tokens=True,
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    inputs["example_id"] = [examples["id"][i] for i in sample_map]

    # Keep the offset mapping for post-processing, but blank out every position
    # that is not part of the context so decoding can never pick question tokens
    for i in range(len(inputs["input_ids"])):
        sequence_ids = inputs.sequence_ids(i)
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None
            for k, o in enumerate(inputs["offset_mapping"][i])
        ]
    return inputs
eval_dataset = raw_dataset["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_dataset["validation"].column_names,
)
print(f"Original validation examples: {len(raw_dataset['validation'])}, Processed validation examples: {len(eval_dataset)}")
print(eval_dataset[0])

import evaluate

squad_metric = evaluate.load("squad")
def compute_metrics(p):
    # p.predictions already holds the start/end logits gathered by the Trainer;
    # there is no need to call trainer.predict() here (doing so would recurse).
    start_logits, end_logits = p.predictions

    # Crude baseline: "predict" the first word of each context. This only gives
    # a floor for EM/F1; real span decoding from the logits is sketched below.
    predicted_answers = {}
    for example in raw_dataset["validation"]:
        predicted_answers[example["id"]] = example["context"].split(" ")[0]

    formatted_predictions = [
        {"id": k, "prediction_text": v} for k, v in predicted_answers.items()
    ]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_dataset["validation"]]
    return squad_metric.compute(predictions=formatted_predictions, references=references)
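import collections
import numpy as np

# A sketch of proper SQuAD-style decoding (assumptions: `compute_metrics_span`,
# `n_best`, and `max_answer_length` are names introduced here, not part of the
# original script). For every validation example it scores the top start/end
# pairs from each of the example's windows and maps the best valid span back to
# the context text through the offset mapping. Pass it to the Trainer as
# `compute_metrics=compute_metrics_span` instead of the baseline above if you
# want meaningful EM/F1 numbers.
def compute_metrics_span(p, n_best=20, max_answer_length=30):
    start_logits, end_logits = p.predictions
    all_offsets = eval_dataset["offset_mapping"]

    # Group tokenized feature indices by the original example they came from
    features_per_example = collections.defaultdict(list)
    for idx, ex_id in enumerate(eval_dataset["example_id"]):
        features_per_example[ex_id].append(idx)

    predicted_answers = []
    for example in raw_dataset["validation"]:
        context = example["context"]
        best_text, best_score = "", float("-inf")
        for feat_idx in features_per_example[example["id"]]:
            offsets = all_offsets[feat_idx]
            start_candidates = np.argsort(start_logits[feat_idx])[-n_best:]
            end_candidates = np.argsort(end_logits[feat_idx])[-n_best:]
            for s in start_candidates:
                for e in end_candidates:
                    # Skip spans outside the context, reversed spans, or overly long spans
                    if (offsets[s] is None or offsets[e] is None
                            or e < s or e - s + 1 > max_answer_length):
                        continue
                    score = start_logits[feat_idx][s] + end_logits[feat_idx][e]
                    if score > best_score:
                        best_score = score
                        best_text = context[offsets[s][0]:offsets[e][1]]
        predicted_answers.append({"id": example["id"], "prediction_text": best_text})

    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_dataset["validation"]]
    return squad_metric.compute(predictions=predicted_answers, references=references)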
output_dir = "./distilbert-squad-lora-qa-ft"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=3e-4,
    num_train_epochs=5,
    logging_steps=100,
    save_steps=500,
    eval_steps=500,
    evaluation_strategy="steps",
    fp16=True,  # requires a CUDA GPU; set to False on CPU-only machines
    report_to="tensorboard",
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    optim="adamw_torch",
    load_best_model_at_end=True,
    # eval_dataset has no labels, so no eval_loss is reported; select the best
    # checkpoint on the F1 returned by compute_metrics instead
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    push_to_hub=False,
)
from transformers import DataCollatorWithPadding

# Features are already padded to max_length, so this collator mostly just batches
# them into tensors; transformers' default_data_collator would also work here.
data_collator = DataCollatorWithPadding(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
# trainer.save_model(output_dir)
# tokenizer.save_pretrained(output_dir)
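# After training, a minimal inference sketch (illustrative, not from the original
# script): run one question/context pair through the fine-tuned model and map the
# best start/end logits back to text via the offset mapping. It naively takes the
# global argmax, so in rare cases the span can land outside the context; the
# post-processing sketched earlier is more robust. Assumes `model` and `tokenizer`
# are still in memory; if only the LoRA adapter was saved, reload the base model
# and attach the adapter with peft before running this.
import torch

def answer_question(question, context):
    enc = tokenizer(
        question,
        context,
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        return_tensors="pt",
    )
    offsets = enc.pop("offset_mapping")[0].tolist()
    enc = {k: v.to(model.device) for k, v in enc.items()}
    with torch.no_grad():
        out = model(**enc)
    start = int(out.start_logits[0].argmax())
    end = int(out.end_logits[0].argmax())
    if end < start:
        return ""
    return context[offsets[start][0]:offsets[end][1]]

print(answer_question(
    "Who wrote the play?",
    "The play Hamlet was written by William Shakespeare around 1600.",
))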