Following best practice, the training data for a BERT model and its tokenizer do not have to be identical, but they should overlap substantially so that the two components work well together. For example, when fine-tuning BERT for text classification, you can simply use the same dataset for both the tokenizer and the model. Here is a simple example:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# Training and test data
train_data = [
"This is a positive sentence.",
"This is a negative sentence."
]
test_data = [
"This is a neutral sentence.",
"This sentence contains both positive and negative sentiment."
]
# Encode the training and test data with the same tokenizer
train_encodings = tokenizer(train_data, truncation=True, padding=True, return_tensors='pt')
train_labels = torch.tensor([1, 0])  # 1 = positive, 0 = negative
test_encodings = tokenizer(test_data, truncation=True, padding=True, return_tensors='pt')
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2)
# Set up the optimizer and loss function, then train the model
model.train()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
for epoch in range(3):  # train for 3 epochs
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
# Make predictions on the test data
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())
print(predictions)  # predicted class indices (1 = positive, 0 = negative)
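If your corpus differs noticeably from the data the pretrained tokenizer was built on (for example, domain-specific vocabulary), you can also retrain the tokenizer on your own texts. The following is a minimal sketch, assuming a fast tokenizer and a small in-memory corpus; the domain_corpus list, the vocab_size of 8000, and the save path are placeholders for illustration, not values from the example above:

from transformers import AutoTokenizer

# Hypothetical domain-specific corpus; replace with your own texts.
domain_corpus = [
    "This is a positive sentence.",
    "This is a negative sentence.",
    "This sentence contains both positive and negative sentiment.",
]

# Start from the pretrained fast tokenizer so the preprocessing rules match BERT.
base_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Build a new WordPiece vocabulary from the corpus while keeping the original
# tokenization pipeline (lowercasing, special tokens, and so on).
new_tokenizer = base_tokenizer.train_new_from_iterator(domain_corpus, vocab_size=8000)
new_tokenizer.save_pretrained('./my-domain-tokenizer')

Keep in mind that a new vocabulary no longer lines up token-for-token with the pretrained embedding matrix, so in practice you would either keep the original tokenizer when fine-tuning a pretrained checkpoint, or pretrain the model together with the new tokenizer. This is exactly why the tokenizer's and the model's training data should overlap.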