Data Preparation
import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler, RandomSampler

def read_file(filepath, label_mapping):
    df = pd.read_csv(filepath, names=('labels', 'sentence'))
    temp_labels = [label_mapping[temp] for temp in df['labels'].to_list()]
    # print(temp_labels)
    df['label'] = temp_labels
    print(df)
    return df

def make_dataset(df):
    tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v2")
    encoding = tokenizer(df['sentence'].tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)
    # print("token 3 =", tokenizer.convert_ids_to_tokens([3]))  # -> [SEP]
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    train_labels = torch.tensor(df['label'].tolist())
    # build the dataset
    dataset = TensorDataset(input_ids, attention_mask, train_labels)
    return dataset

def make_dataloader(dataset, batch_size):
    dataloader = DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        batch_size=batch_size
    )
    return dataloader

train_file = '../data/train.csv'
train_df = read_file(train_file, label_mapping)
train_dataset = make_dataset(train_df)
train_dataloader = make_dataloader(train_dataset, batch_size)
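The calls above assume that label_mapping and batch_size are already defined, and that ../data/train.csv has no header row, with the label string in the first column and the sentence in the second. A minimal sketch of those definitions, using placeholder author names and batch size:

# hypothetical values for illustration only: the keys must match the label strings in train.csv
label_mapping = {'author_a': 0, 'author_b': 1, 'author_c': 2}  # 3 classes, matching the 3-unit classifier below
batch_size = 16  # assumed; adjust to the available GPU memory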
Model Definition
An attention_mask can be passed to self.bert explicitly, but in this example only input_ids is given as input (a sketch of the attention_mask variant follows the class definition below).
from transformers import BertModel
import torch
from torch import nn

class BertForClassification(nn.Module):
    '''3-class classification model based on BERT for author recognition of Japanese novels'''
    def __init__(self):
        super(BertForClassification, self).__init__()
        # BERT
        self.bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-v2')
        # add one linear layer on top of the final-layer embedding of the first token ([CLS])
        # BERT final layer: 768 dimensions x up to 512 tokens
        # the added linear layer has 3 output units (3-category classification)
        self.classifier = nn.Linear(in_features=768, out_features=3)
        # initialize weights and bias
        nn.init.normal_(self.classifier.weight, std=0.02)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids):  # defining the network
        '''
        input_ids: [batch_size, sequence_length] (token ids of texts)
        '''
        # forward pass through BERT
        result = self.bert(input_ids)  # result contains the sequence output and the pooled output
        # get the sequence output (all token embeddings of the final layer)
        sequence_output = result[0]  # index 0 is the sequence output: [batch, length, hidden_size]
        cls_emb = sequence_output[:, 0]  # embedding of the first token (corresponding to [CLS])
        output = self.classifier(cls_emb)  # apply the final linear layer
        return output
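As noted above, self.bert can also receive the attention_mask built in make_dataset. A sketch of a drop-in replacement for the forward method inside the class that accepts it (an optional variant, not part of the example above):

    def forward(self, input_ids, attention_mask=None):
        '''
        input_ids:      [batch_size, sequence_length] (token ids of texts)
        attention_mask: [batch_size, sequence_length] with 1 for real tokens, 0 for padding (optional)
        '''
        result = self.bert(input_ids, attention_mask=attention_mask)
        cls_emb = result[0][:, 0]        # [CLS] embedding from the sequence output
        return self.classifier(cls_emb)  # logits for the 3 classes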
Setting the Trainable Parameters and Learning Rates
from torch import optim

def parameter_setting(model):
    # 1. first, turn off gradients for all parameters
    for param in model.parameters():
        param.requires_grad = False
    # 2. to fine-tune the weights of the last BertLayer module,
    #    turn its gradients back on
    for param in model.bert.encoder.layer[-1].parameters():
        param.requires_grad = True
    # 3. the added classification layer must also be trained, so enable its gradients
    for param in model.classifier.parameters():
        param.requires_grad = True

def set_optimizer(model):
    # learning rates are set per parameter group:
    # the pretrained BERT weights should be updated with a small learning rate,
    # while the newly added classifier layer can take a slightly larger one.
    optimizer = optim.Adam([
        {'params': model.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
        {'params': model.classifier.parameters(), 'lr': 1e-4}
    ])
    return optimizer
Other Notes
The basic building blocks are given above, but you still need to call these functions and adapt the training loop so that everything runs end to end; a minimal sketch of such a loop follows.
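The sketch below assumes the definitions above (label_mapping, batch_size, the data functions, the model, and the optimizer setup) and feeds only input_ids to the model, so the attention_mask unpacked from the TensorDataset is unused. Cross-entropy loss, the epoch count, and the device handling are placeholder choices, not prescribed by the original text:

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForClassification()
parameter_setting(model)
optimizer = set_optimizer(model)
model.to(device)

criterion = nn.CrossEntropyLoss()
num_epochs = 2  # assumed value

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for input_ids, attention_mask, labels in train_dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids)          # attention_mask is not used by this model
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'epoch {epoch + 1}: loss = {total_loss / len(train_dataloader):.4f}')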