Notes on Trainer

The basic pattern is to set the various options in TrainingArguments, create a Trainer instance with them, and run training, as shown below.

from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score


def compute_metrics(result):
    labels = result.label_ids
    preds = result.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
    }


training_args = TrainingArguments(
    output_dir='./model',            # directory where training results (checkpoints) are saved
    num_train_epochs=20,             # number of epochs
    per_device_train_batch_size=32,  # training batch size
    per_device_eval_batch_size=32,
    warmup_steps=500,                # learning rate ramps up from 0 over this many steps
    weight_decay=0.01,               # weight decay rate
    save_total_limit=1,
    save_strategy='epoch',
    evaluation_strategy="epoch",     # when to evaluate ("steps" or "epoch")
    eval_steps=100,                  # evaluate every N steps (only used when evaluation_strategy="steps")
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # pick one of the keys returned by compute_metrics
)

trainer = Trainer(
    model=model,                  # model to train
    args=training_args,           # TrainingArguments
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,    # evaluation dataset
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()
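
After training, the same Trainer can score the held-out set. A minimal sketch reusing the trainer and test_dataset defined above (classification_report assumes integer class labels):

from sklearn.metrics import classification_report

# evaluate() reruns the eval loop and returns the metrics from compute_metrics
print(trainer.evaluate())

# predict() additionally returns the raw logits and the true labels
pred_output = trainer.predict(test_dataset)
preds = pred_output.predictions.argmax(-1)
print(classification_report(pred_output.label_ids, preds))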

Points to note when building the model

The forward pass must return the loss and the logits wrapped in a ModelOutput, e.g. ModelOutput(loss=loss, logits=logits), as in the following example.

from typing import Optional, Tuple, Union

import torch
from torch import nn
from transformers.utils import ModelOutput


class NeuralNetwork(nn.Module):
    def __init__(self, loss_fn=None):
        super().__init__()
        self.loss_fn = loss_fn

        self.linear_relu_stack = nn.Sequential(
            nn.Linear(3010, 200),
            nn.ReLU(),
            nn.Linear(200, 3)
        )
        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self,
                input_ids: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None,
                ) -> Union[Tuple[torch.Tensor], ModelOutput]:
        logits = self.linear_relu_stack(input_ids)

        # the loss is only computed when labels are passed (training / evaluation)
        loss = None
        if labels is not None and self.loss_fn is not None:
            loss = self.loss_fn(logits, labels)

        return ModelOutput(
            loss=loss,
            logits=logits,
        )
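
Trainer calls the model with each dataset item's keys as keyword arguments, so the dataset has to yield dicts whose keys match the forward signature above (input_ids, labels). A minimal sketch of one way to wire this up; the tensors X_train, y_train, X_test, y_test are hypothetical names for already-prepared features of shape [N, 3010] and integer labels:

import torch
from torch.utils.data import Dataset


class SimpleDataset(Dataset):
    """Returns dicts whose keys match NeuralNetwork.forward (input_ids, labels)."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {"input_ids": self.X[idx], "labels": self.y[idx]}


model = NeuralNetwork(loss_fn=torch.nn.CrossEntropyLoss())
train_dataset = SimpleDataset(X_train, y_train)
test_dataset = SimpleDataset(X_test, y_test)
# these can then be passed to the Trainer exactly as in the first example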

Points to note when applying LoRA

In general, the best approach is to read the model implementations in Hugging Face's Transformers library and use them as a reference.
The example below is BERT classification code adapted from those implementations (the snippet shown here classifies from the [CLS] token of the final layer).

from typing import Optional, Tuple, Union

import torch
from torch import nn
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput


class BertForClassification(BertPreTrainedModel):
    '''3-class classification model based on BERT for author recognition of Japanese novels'''

    def __init__(self, config, loss_fn=None):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels
        self.loss_fn = loss_fn

        # BERT
        self.bert = BertModel(config)

        # add one linear layer to the first token corresponding to [CLS] at the final layer 
        # BERT final layer 768 dimension x 512 tokens
        # the added linear layer has 3 units (that means 3 category classification)
        self.cls = nn.Linear(in_features=config.hidden_size, out_features=config.num_labels)

        # initializing weights and bias
        nn.init.normal_(self.cls.weight, std=0.02)
        nn.init.zeros_(self.cls.bias)

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            token_type_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            head_mask: Optional[torch.Tensor] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
        ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: #defining network 
        '''
        input_ids: [batch_size, sequence_length] (token ids of texts)
        '''

        # defining forward network using BERT outputs
        result = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )  # result contains "sequence_output" and "pooled_output"

        # get the first-token ([CLS]) embedding from the "sequence_output"
        vec_0 = result[0]  # position 0 is "sequence_output"; shape [batch, length, hidden_size]
        vec_0 = vec_0[:, 0, :]  # take position 0 of every sequence in the mini-batch
                                # vec_0 is now [batch, hidden_size]
        vec_0 = vec_0.view(-1, self.config.hidden_size)  # ensure shape [batch_size, hidden_size]
        output = self.cls(vec_0)  # apply the final linear layer

        logits = output

        # the loss is only computed when labels are passed
        loss = None
        if labels is not None and self.loss_fn is not None:
            loss = self.loss_fn(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=result.hidden_states,
            attentions=result.attentions,
        )
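
To instantiate this class, the usual from_pretrained path works: keyword arguments the config does not recognize are forwarded to __init__. A minimal sketch; the checkpoint name cl-tohoku/bert-base-japanese-whole-word-masking is only an assumption (any BERT checkpoint with a matching tokenizer would do):

import torch

model = BertForClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking",  # assumed checkpoint name
    num_labels=3,                         # consumed by the config
    loss_fn=torch.nn.CrossEntropyLoss(),  # not a config attribute, so forwarded to __init__
)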

Also, when applying LoRA, if task_type is not set correctly you may get errors once the Trainer is used. When training without the Trainer, depending on how the loop is implemented, leaving task_type unset may not raise any error, so take care not to get tripped up by this. The valid task_type values and the PEFT model classes they map to are:

  • "SEQ_CLS": PeftModelForSequenceClassification
  • "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM
  • "CAUSAL_LM": PeftModelForCausalLM
  • "TOKEN_CLS": PeftModelForTokenClassification
  • "QUESTION_ANS": PeftModelForQuestionAnswering
  • "FEATURE_EXTRACTION": PeftModelForFeatureExtraction

from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['query', 'key', 'value', 'attention.output.dense'],
    lora_dropout=0.05,
    bias="none",
    # bias="lora_only",
    task_type="SEQ_CLS",
    modules_to_save=['cls'],  # this does not seem to take effect with inject_adapter_in_model
)
model = get_peft_model(model, config)
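
As a quick sanity check (a sketch, not part of the original notes), the wrapped model can report which parameters remain trainable; the classification head listed in modules_to_save should appear among them:

# shows how many parameters LoRA leaves trainable (adapters + modules_to_save)
model.print_trainable_parameters()

# optionally list the trainable parameter names to confirm that 'cls' is included
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)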