BERT basics

Data preparation

import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler, RandomSampler

def read_file(filepath, label_mapping):
    # read a CSV of (label, sentence) pairs and map the raw labels to integer ids
    df = pd.read_csv(filepath, names=('labels', 'sentence'))
    df['label'] = [label_mapping[label] for label in df['labels'].to_list()]
    print(df)
    return df

def make_dataset(df):
    tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v2")
    encoding = tokenizer(df['sentence'].tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)
    #print ("3 toekn=", tokenizer.convert_ids_to_tokens([3])) [SEP]
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    train_labels = torch.tensor(df['label'].tolist())
    # build a TensorDataset of (input_ids, attention_mask, labels)
    dataset = TensorDataset(input_ids, attention_mask, train_labels)
    return dataset

def make_dataloader(dataset, batch_size):
    # shuffle the training data with RandomSampler
    dataloader = DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        batch_size=batch_size
    )
    return dataloader
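
The calls below use label_mapping and batch_size, which are not defined in this article. The values here are only illustrative assumptions (three author classes, matching the 3-unit classifier defined later):

label_mapping = {'author_a': 0, 'author_b': 1, 'author_c': 2}  # hypothetical label names
batch_size = 16  # illustrative value; adjust to the available GPU memory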


train_file = '../data/train.csv'
train_df = read_file(train_file, label_mapping)
train_dataset = make_dataset(train_df)
train_dataloader = make_dataloader(train_dataset, batch_size)

Model definition

An attention_mask can be passed to self.bert explicitly, but here only input_ids is fed in (see the sketch after the class definition for a variant that also passes attention_mask).

from transformers import BertModel
import torch
from torch import nn

class BertForClassification(nn.Module):
    '''3-class classification model based on BERT for author recognition of Japanese novels'''

    def __init__(self):
        super(BertForClassification, self).__init__()

        # BERT
        self.bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-v2')

        # add one linear layer on top of the final-layer embedding of the first token ([CLS])
        # BERT final layer: 768-dimensional hidden states x up to 512 tokens
        # the added linear layer has 3 output units (i.e. 3-class classification)
        self.classifier = nn.Linear(in_features=768, out_features=3)

        # initialize the weights; the bias is set to zero
        nn.init.normal_(self.classifier.weight, std=0.02)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids): #defining network 
        '''
        input_ids: [batch_size, sequence_length] (token ids of texts)
        '''

        # run BERT; result contains "sequence_output" and "pooled_output"
        result = self.bert(input_ids)

        # index 0 is "sequence_output", with shape [batch, length, hidden_size]
        sequence_output = result[0]
        cls_emb = sequence_output[:, 0]  # embedding of the first token (corresponding to [CLS])
        output = self.classifier(cls_emb)  # apply the final linear layer

        return output
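
As noted above, self.bert also accepts an attention_mask so that padded positions can be ignored. As a reference, a minimal sketch of a forward method that passes it through (not what this article uses) might look like this; the attention_mask argument is added purely for illustration:

    def forward(self, input_ids, attention_mask=None):
        '''
        input_ids: [batch_size, sequence_length] (token ids of texts)
        attention_mask: [batch_size, sequence_length], 1 for real tokens and 0 for padding
        '''
        result = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = result[0]      # [batch, length, hidden_size]
        cls_emb = sequence_output[:, 0]  # [CLS] embedding
        return self.classifier(cls_emb)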

Setting the trainable parameters and learning rates

from torch import optim

def parameter_setting(model):
    # 1. first, freeze all parameters (requires_grad = False)
    for param in model.parameters():
        param.requires_grad = False

    # 2. unfreeze the final BertLayer module of the encoder so its weights are fine-tuned
    for param in model.bert.encoder.layer[-1].parameters():
        param.requires_grad = True

    # 3. the added classification layer must also be trained, so unfreeze it as well
    for param in model.classifier.parameters():
        param.requires_grad = True

def set_optimizer(model):
    # layer-wise learning rates: the pretrained BERT weights are updated with a small
    # learning rate, while the newly added classification layer uses a slightly larger one
    optimizer = optim.Adam([
        {'params': model.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
        {'params': model.classifier.parameters(), 'lr': 1e-4}
    ])
    return optimizer
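
A brief usage sketch of the two functions above:

model = BertForClassification()
parameter_setting(model)          # freeze everything except the last BERT layer and the classifier
optimizer = set_optimizer(model)  # layer-wise learning rates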

Miscellaneous

The basic building blocks are as shown above, but you still need to call these functions and implement the training loop yourself so that the whole pipeline runs; a minimal sketch of such a loop is shown below.
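
As a reference, here is a minimal sketch of a training loop. It assumes the model, optimizer, and train_dataloader built above; the device handling, loss function, and number of epochs are illustrative choices rather than part of the original code:

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = nn.CrossEntropyLoss()

num_epochs = 3  # illustrative value
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for input_ids, attention_mask, labels in train_dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids)  # attention_mask is not passed, matching forward() above
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'epoch {epoch + 1}: mean loss = {total_loss / len(train_dataloader):.4f}')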