【Bert】Multi-Label Text Classification

1.1 Reference

The research paper "How to Fine-Tune BERT for Text Classification?" from Xipeng Qiu's group at Fudan University.

Paper: https://arxiv.org/pdf/1905.05583.pdf

1.2 Approach of the paper

The paper explores different ways of fine-tuning BERT for text classification and aims to provide a general fine-tuning recipe. It investigates three directions:

  • (1) BERT's own fine-tuning strategies, including how to handle long texts, the learning rate, and which layers to use;
  • (2) further pre-training BERT within the target task, within the domain, and across domains;
  • (3) multi-task learning. The fine-tuned BERT achieved state-of-the-art results on seven English datasets and on the Sogou Chinese dataset.

1.3 Code and data sources

Authors' implementation: https://github.com/xuyige/BERT4doc-Classification

Dataset: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset?select=train.csv

The dataset contains six labels (Computer Science, Physics, Mathematics, Statistics, Biology, Finance), and the task is to classify research papers based on their title and abstract. A value of 1 in a label column means the paper carries that label, and a paper can have more than one label set to 1.
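
To get a feel for the labels, a minimal sketch like the one below can be used (it assumes train.csv from the Kaggle link has been saved to data/train.csv, the same path used by get_data() later in 2.2.1):

import pandas as pd

# Assumes the columns are ID, TITLE, ABSTRACT followed by the six label columns,
# as implied by the slicing in get_data() below.
df = pd.read_csv("data/train.csv")
label_columns = df.columns.tolist()[3:]

print(df[label_columns].sum().sort_values())        # number of papers per label
print((df[label_columns].sum(axis=1) > 1).mean())   # fraction of papers with more than one label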

2.1 Import

#2.1 Import

#For installing torch, see https://blog.csdn.net/Checkmate9949/article/details/119494673?spm=1001.2014.3001.5501
import torch
from transformers import BertTokenizerFast as BertTokenizer
from utils.plot_results import plot_results
from resources.train_val_model import train_model
from resources.get_data import get_data
from resources.build_model import BertClassifier
from resources.test_model import test_model
from resources.build_dataloader import build_dataloader

2.2 Get data: split the samples

##################################
#           get data
##################################

#get_data() is defined in 2.2.1 below
train_df, val_df, test_df = get_data()

#fixed parameters
#Label columns: the 4th column through the second-to-last (the six topic labels)
label_columns = train_df.columns.tolist()[3:-1]

num_labels = len(label_columns)
max_token_len = 30

BERT_MODEL_NAME = "bert-base-uncased"
bert-base-uncased: for English. bert-base-Chinese
BERT_MODEL_NAME = "model/bert-base-uncased"
#分词
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
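
A quick, purely illustrative look at what the tokenizer produces for one made-up sentence shows why max_token_len and the padding options matter:

#Illustration only; the sample sentence is not from the dataset.
sample_text = "Attention-based models for multi-label classification of research papers."
encoding = tokenizer.encode_plus(
    sample_text,
    add_special_tokens=True,   # adds [CLS] and [SEP]
    max_length=max_token_len,  # pad / truncate to 30 tokens
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)
print(encoding["input_ids"].shape)       # torch.Size([1, 30])
print(encoding["attention_mask"].shape)  # torch.Size([1, 30])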

2.2.1 get_data

import pandas as pd
from sklearn.model_selection import train_test_split

def get_data():
    df = pd.read_csv("data/train.csv")
    # Merge the title and the abstract into a single TEXT field
    df["TEXT"] = df["TITLE"] + df["ABSTRACT"]

    label_columns = df.columns.tolist()[3:-1]
    print(df[label_columns].sum().sort_values())
    # Split off the training set: with test_size=0.8, the second return value
    # (train_df) holds 80% of the data and the first (test_df) the remaining 20%
    test_df, train_df = train_test_split(df, test_size=0.8, random_state=42)
    # Split the remaining 20% evenly into validation and test sets (10% each)
    test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
    # Return the three splits
    return train_df, val_df, test_df

2.3 Build data loaders

##################################
#       build data loaders
##################################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Training set: random sampling
#Validation and test sets: sequential sampling
train_dataloader = build_dataloader(
    train_df, label_columns, tokenizer, max_token_len, trainset=True
)
val_dataloader = build_dataloader(val_df, label_columns, tokenizer, max_token_len)
test_dataloader = build_dataloader(test_df, label_columns, tokenizer, max_token_len)
build_dataloader

import os
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

os.environ["TOKENIZERS_PARALLELISM"] = "false"

class text_dataset(Dataset):
    def __init__(self, df, label_columns, tokenizer, max_token_len):
        self.data = df
        self.label_columns = label_columns
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    # Number of samples in the dataset
    def __len__(self):
        return len(self.data)

    # Fetch one sample by index
    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        text = data_row["TEXT"]
        labels = data_row[self.label_columns]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return dict(
            text=text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.FloatTensor(labels),
        )

def build_dataloader(df, label_columns, tokenizer, max_token_len, trainset=False):
    dataset = text_dataset(df, label_columns, tokenizer, max_token_len)

    # Training set: draw samples in random order
    if trainset:
        sampler = RandomSampler(dataset)
    # Validation / test: draw samples in their original order
    else:
        sampler = SequentialSampler(dataset)

    return DataLoader(dataset, batch_size=10, sampler=sampler)
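
To check what the dataloaders actually deliver, one batch can be inspected (illustrative only; batch_size=10 is hard-coded in build_dataloader and max_token_len=30 was set in 2.2):

#Peek at one batch from the training dataloader built above
batch = next(iter(train_dataloader))
print(batch["input_ids"].shape)       # torch.Size([10, 30])  -> [batch, max_token_len]
print(batch["attention_mask"].shape)  # torch.Size([10, 30])
print(batch["labels"].shape)          # torch.Size([10, 6])   -> one 0/1 value per label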

2.4 Build model

##################################
#       build model
##################################

bert_classifier = BertClassifier(
    num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME, freeze_bert=False
)
import torch
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, num_labels: int, BERT_MODEL_NAME, freeze_bert=False):
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)

        #  hidden size of BERT, hidden size of our classifier, and number of labels to classify
        D_in, H, D_out = self.bert.config.hidden_size, 50, num_labels

        # Instantiate a small two-layer feed-forward classifier head
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(H, D_out),
        )
        # loss
        self.loss_func = nn.BCEWithLogitsLoss()

        if freeze_bert:
            print("freezing bert parameters")
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)

        # Extract the last hidden state of the token [CLS] for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)

        if labels is not None:
            # BCEWithLogitsLoss applies the sigmoid internally, so it takes the raw logits
            loss = self.loss_func(
                logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
            )
            return loss
        else:
            return logits
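
A small smoke test of the classifier defined above, using dummy token ids instead of real data (it reuses the bert_classifier instance created at the top of this section; the sketch only checks shapes):

#Dummy batch of 2 sequences; 30522 is the bert-base-uncased vocabulary size
dummy_input_ids = torch.randint(0, 30522, (2, max_token_len))
dummy_attention_mask = torch.ones(2, max_token_len, dtype=torch.long)

with torch.no_grad():
    logits = bert_classifier(dummy_input_ids, dummy_attention_mask)
print(logits.shape)           # torch.Size([2, 6]): one raw score per label
print(torch.sigmoid(logits))  # independent per-label probabilities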

2.5 Train the model and select the best parameters via validation

##################################
#    train and validate model
##################################

trained_model, training_stats, train_loss_set = train_model(
    bert_classifier,
    train_dataloader,
    val_dataloader=val_dataloader,
    epochs=5,
    evaluation=True,
)

plot_results(training_stats, train_loss_set)
import time
import random
import numpy as np
import torch
from utils.helper_functions import format_time
from transformers import AdamW, get_linear_schedule_with_warmup

def train_model(
    model, train_dataloader, val_dataloader=None, epochs=5, evaluation=False
):
    """Train and validate the BertClassifier model."""
    training_stats = []
    train_loss_set = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer, scheduler = build_optimizer_scheduler(
        model=model, epochs=epochs, train_dataloader=train_dataloader
    )
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        print(
            f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
        )
        print("-" * 70)
        t0 = time.time()
        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids = batch["input_ids"].to(device)
            b_attention_mask = batch["attention_mask"].to(device)
            b_labels = batch["labels"].to(device)
            # b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            loss = model(
                input_ids=b_input_ids,
                attention_mask=b_attention_mask,
                labels=b_labels,
            )
            batch_loss += loss.item()
            total_loss += loss.item()
            train_loss_set.append(loss.item())

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch
                print(
                    f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}"
                )
                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("-" * 70)

        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            avg_val_loss, avg_val_accuracy, validation_time = evaluate(
                model, val_dataloader
            )
            time_elapsed = time.time() - t0_epoch
            print(
                f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {avg_val_loss:^10.6f} | {avg_val_accuracy:^9.2f} | {time_elapsed:^9.2f}"
            )
            print("-" * 70)

        # Save the model whenever the validation loss improves on the previous epoch
        if (
            len(training_stats) == 0
            or training_stats[-1]["Valid. Loss"] > avg_val_loss
        ):
            model_dir = "model/model.pt"
            torch.save(model.state_dict(), model_dir)

        training_stats.append(
            {
                "epoch": epoch_i + 1,
                "Training Loss": avg_train_loss,
                "Valid. Loss": avg_val_loss,
                "Valid. Accur.": avg_val_accuracy,
                "Training Time": training_time,
                "Validation Time": validation_time,
            }
        )

    print("\n")
    print("Training complete!")
    return model, training_stats, train_loss_set

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.

"""
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.

    t0 = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Tracking variables
    avg_val_accuracy = []
    avg_val_loss = []

    # For each batch in our validation set...

    for batch in val_dataloader:
        b_input_ids = batch["input_ids"].to(device)
        b_attention_mask = batch["attention_mask"].to(device)
        b_labels = batch["labels"].to(device)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attention_mask)

        # Compute the loss (BCEWithLogitsLoss expects raw logits, not probabilities)
        loss = model.loss_func(
            logits.view(-1, model.num_labels), b_labels.view(-1, model.num_labels)
        )
        avg_val_loss.append(loss.item())

        # Per-label probabilities
        predictions = torch.sigmoid(logits)

        # Get the predictions
        preds = torch.round(predictions)

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        avg_val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.

    avg_val_loss = np.mean(avg_val_loss)
    avg_val_accuracy = np.mean(avg_val_accuracy)
    validation_time = format_time(time.time() - t0)
    return avg_val_loss, avg_val_accuracy, validation_time

def build_optimizer_scheduler(model, epochs, train_dataloader):

    # Set custom optimization parameters for the huggingface model, and build a
    # learning-rate scheduler as well.

    param_optimizer = list(model.named_parameters())
    # No weight decay for biases and LayerNorm parameters
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=5e-5,  # Default learning rate
        eps=1e-8,  # Default epsilon value
    )

    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value
        num_training_steps=total_steps,
    )

    return optimizer, scheduler
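
The accuracy reported by evaluate() is element-wise: the per-label probabilities are thresholded at 0.5 with torch.round and compared cell by cell with the targets, so every (sample, label) pair counts equally. A self-contained toy example of how that number behaves:

import torch

probs = torch.tensor([[0.9, 0.2, 0.7],
                      [0.4, 0.8, 0.1]])
labels = torch.tensor([[1.0, 0.0, 1.0],
                       [1.0, 1.0, 0.0]])
preds = torch.round(probs)                          # [[1., 0., 1.], [0., 1., 0.]]
accuracy = (preds == labels).numpy().mean() * 100   # 5 of 6 cells match -> 83.33
print(accuracy)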

2.6 Test the model

##################################
#          test model
##################################

test_model(
    test_dataloader=test_dataloader,
    BERT_MODEL_NAME=BERT_MODEL_NAME,
    num_labels=num_labels,
    label_columns=label_columns,
)
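
test_model() is imported from resources/test_model.py, which this article does not list. The sketch below is only a guess at what such a test step could look like, not the author's implementation; it reloads the model/model.pt checkpoint saved by train_model() and prints a simple per-label accuracy.

import torch
from resources.build_model import BertClassifier

def test_model(test_dataloader, BERT_MODEL_NAME, num_labels, label_columns):
    """Hypothetical sketch: reload the best checkpoint and report per-label accuracy."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertClassifier(num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME)
    model.load_state_dict(torch.load("model/model.pt", map_location=device))
    model.to(device)
    model.eval()

    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            all_preds.append(torch.round(torch.sigmoid(logits)).cpu())
            all_labels.append(batch["labels"].cpu())

    preds = torch.cat(all_preds)
    labels = torch.cat(all_labels)
    for i, name in enumerate(label_columns):
        acc = (preds[:, i] == labels[:, i]).float().mean().item() * 100
        print(f"{name}: {acc:.2f}% accuracy")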

Original: https://blog.csdn.net/Checkmate9949/article/details/119893343
Author: Checkmate9949
Title: 【Bert】Multi-Label Text Classification
