Basically, as shown below, you configure the various options in TrainingArguments, create a Trainer instance, and run training.
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from sklearn.metrics import classification_report, accuracy_score

def compute_metrics(result):
    labels = result.label_ids
    preds = result.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
    }

training_args = TrainingArguments(
    output_dir='./model',              # directory where training results are saved
    num_train_epochs=20,               # number of epochs
    per_device_train_batch_size=32,    # batch size
    per_device_eval_batch_size=32,
    warmup_steps=500,                  # learning rate ramps up from 0 over this many steps
    weight_decay=0.01,                 # weight decay rate
    save_total_limit=1,
    save_strategy='epoch',
    evaluation_strategy="epoch",       # when to evaluate ("steps" or "epoch")
    eval_steps=100,                    # evaluate every N steps (only used when evaluation_strategy="steps")
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # choose from the keys returned by compute_metrics
)

trainer = Trainer(
    model=model,                       # model to train
    args=training_args,                # TrainingArguments
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,       # training dataset
    eval_dataset=test_dataset,         # evaluation dataset
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()
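After training, the best checkpoint (restored via load_best_model_at_end) can be evaluated with trainer.predict, and the classification_report imported above fits naturally here. A minimal sketch, assuming the trainer and test_dataset defined above:

# Minimal evaluation sketch (uses the trainer / test_dataset from the snippet above).
pred_result = trainer.predict(test_dataset)       # returns predictions and label_ids
y_pred = pred_result.predictions.argmax(-1)
y_true = pred_result.label_ids
print(classification_report(y_true, y_pred))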
Notes on building the model
The forward method must return the loss and the logits wrapped in a ModelOutput, e.g. ModelOutput(loss=loss, logits=logits), so that the Trainer can consume them.
import torch
from torch import nn
from typing import Optional, Tuple, Union
from transformers.utils import ModelOutput

class NeuralNetwork(nn.Module):
    def __init__(self, loss_fn=None):
        super().__init__()
        self.loss_fn = loss_fn
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(3010, 200),
            nn.ReLU(),
            nn.Linear(200, 3)
        )
        # initialize weights and biases
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self,
        input_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple[torch.Tensor], ModelOutput]:
        logits = self.linear_relu_stack(input_ids)
        loss = None
        if labels is not None and self.loss_fn is not None:
            loss = self.loss_fn(logits, labels)
        return ModelOutput(
            loss=loss,
            logits=logits,
        )
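The Trainer passes each dataset item to the model as keyword arguments, so the dataset has to yield dicts whose keys match the forward() signature (input_ids and labels here). A minimal sketch with a hypothetical toy dataset of random features, just to show the expected format:

# The dataset class, random features, and sizes below are hypothetical examples.
import torch
from torch import nn
from torch.utils.data import Dataset

class DictDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        # keys must match the forward() argument names
        return {"input_ids": self.features[idx], "labels": self.labels[idx]}

features = torch.randn(100, 3010)      # 100 toy samples with 3010 features each
labels = torch.randint(0, 3, (100,))   # 3 classes
train_dataset = DictDataset(features[:80], labels[:80])
test_dataset = DictDataset(features[80:], labels[80:])

model = NeuralNetwork(loss_fn=nn.CrossEntropyLoss())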
Notes on applying LoRA
Basically, the best approach is to read the model implementations in the Hugging Face Transformers library and use them as a reference.
The following example is a BERT-based classification model modified to classify from the [CLS] token of the final layer.
from typing import Optional, Tuple, Union
import torch
from torch import nn
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BertForClassification(BertPreTrainedModel):
    '''3-class classification model based on BERT for author recognition of Japanese novels'''

    def __init__(self, config, loss_fn=None):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels
        self.loss_fn = loss_fn
        # BERT
        self.bert = BertModel(config)
        # add one linear layer on top of the first token ([CLS]) of the final layer
        # BERT final layer: hidden_size (768) dimensions x 512 tokens
        # the added linear layer has num_labels units (here, 3-category classification)
        self.cls = nn.Linear(in_features=config.hidden_size, out_features=config.num_labels)
        # initialize weights and bias
        nn.init.normal_(self.cls.weight, std=0.02)
        nn.init.zeros_(self.cls.bias)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        '''
        input_ids: [batch_size, sequence_length] (token ids of texts)
        '''
        # forward pass through BERT
        result = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )  # result contains the sequence output and the pooled output
        # get the first-token embedding from the sequence output
        vec_0 = result[0]        # position 0 is the sequence output: [batch, length, hidden_size]
        vec_0 = vec_0[:, 0, :]   # extract the [CLS] position for every example in the mini batch
        # now vec_0 is [batch, hidden_size]
        vec_0 = vec_0.view(-1, self.config.hidden_size)  # ensure shape [batch_size, hidden_size]
        logits = self.cls(vec_0)                         # apply the final linear layer
        loss = None
        if labels is not None and self.loss_fn is not None:
            loss = self.loss_fn(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=result.hidden_states,
            attentions=result.attentions,
        )
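Since the class subclasses BertPreTrainedModel, pretrained weights can be loaded with from_pretrained. A minimal sketch, assuming a Japanese BERT checkpoint (the name is only an example); keyword arguments that are not config fields, such as loss_fn, are forwarded to __init__:

# Minimal loading sketch; the checkpoint name is only an example.
from torch import nn
from transformers import BertConfig

model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
config = BertConfig.from_pretrained(model_name, num_labels=3)
model = BertForClassification.from_pretrained(
    model_name,
    config=config,
    loss_fn=nn.CrossEntropyLoss(),   # forwarded to BertForClassification.__init__
)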
Also, when applying LoRA, errors can occur with the Trainer if task_type is not set correctly. If you do not use the Trainer, then depending on the implementation no error is raised even without setting task_type, so be careful not to get tripped up by this. The valid task_type values map to the following PEFT model classes:
- "SEQ_CLS": PeftModelForSequenceClassification
- "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM
- "CAUSAL_LM": PeftModelForCausalLM
- "TOKEN_CLS": PeftModelForTokenClassification
- "QUESTION_ANS": PeftModelForQuestionAnswering
- "FEATURE_EXTRACTION": PeftModelForFeatureExtraction
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['query', 'key', 'value', 'attention.output.dense'],
    lora_dropout=0.05,
    bias="none",
    # bias="lora_only",
    task_type="SEQ_CLS",
    modules_to_save=['cls'],  # this does not seem to take effect with inject_adapter_in_model
)

model = get_peft_model(model, config)
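After wrapping, it is worth confirming that only the LoRA adapters and the module listed in modules_to_save ('cls' here) remain trainable. A quick check:

# Sanity check: only LoRA parameters and the 'cls' head should require gradients.
model.print_trainable_parameters()
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)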