Using Bert_BiLSTM_CRF for Entity or Event Sequence Labeling and Extracting Fine-tuned BERT Word Embeddings

Sequence labeling tasks

Many natural language processing tasks are now framed as neural sequence labeling tasks, including named entity recognition and event extraction, two sub-tasks of information extraction. In sequence labeling, a trained model assigns a probability-based label to every character in the text, so the spans we want to recognize can be marked with specific tags. This can be seen as a form of encoding and decoding: much like a communication system, many NLP models first encode the text and then decode the representation to obtain the result.

This post presents a basic Bert_BiLSTM_CRF tagger. BERT is a pre-trained language model used here to obtain vector representations of the text; trained on large-scale corpora, it captures textual features well. BiLSTM is a bidirectional variant of the LSTM: it encodes the sequence in both directions and preserves long-range semantic information. The CRF layer can be viewed as a relative of the hidden Markov model (HMM): an HMM only conditions the current state on the previous one, whereas the CRF also takes the following state into account. In text this amounts to the influence of one character on its neighbor, which can be seen as a conditional probability such as P(current character | previous character), or at the label level P(current tag | previous tag).
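As a toy illustration of the kind of constraint the CRF captures (the scores below are made up, not taken from the model in this post): transitions between neighboring tags are scored, so an I tag that continues a B span is rewarded while an I that appears right after an O is penalized.

# Toy example with made-up scores: a CRF-style transition table over B/I/O tags.
# transition[(prev, cur)] is the score of moving from tag `prev` to tag `cur`.
transition = {
    ('B', 'I'): 2.1,   # 'I' continuing a span opened by 'B': likely
    ('O', 'I'): -3.5,  # 'I' right after 'O' (no span was opened): unlikely
    ('B', 'O'): 0.4,
    ('O', 'B'): 1.2,
}

def path_score(tags):
    """Sum of transition scores along a tag sequence (emission scores omitted)."""
    return sum(transition.get((prev, cur), 0.0) for prev, cur in zip(tags, tags[1:]))

print(path_score(['O', 'B', 'I', 'O']))  # a well-formed span scores higher...
print(path_score(['O', 'O', 'I', 'O']))  # ...than a path with an orphan 'I'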

Data processing

First, the text has to be converted into a (character label) format: each line holds one character and its tag, and a blank line separates the last character of one sentence from the first character of the next. The format looks like this:

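For example, a made-up fragment in this format might look like the following (the characters and tags are purely illustrative; B marks the start of a target span, I its continuation, and O everything else, matching the VOCAB used in the code below):

地 B
震 I
发 O
生 O
在 O
昨 O
天 O

记 O
者 O
报 O
道 O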
The data-processing code, utils.py, is given below:
import os
import numpy as np
import logging
import torch
from torch.utils.data import Dataset
from typing import Tuple, List
from pytorch_pretrained_bert import BertTokenizer

logger = logging.getLogger(__name__)

# Path to the pre-trained Chinese BERT checkpoint (downloaded separately).
bert_model = './chinese_L-12_H-768_A-12'
tokenizer = BertTokenizer.from_pretrained(bert_model)

# '' (index 0) doubles as the padding / non-head word-piece tag.
VOCAB = ('', '[CLS]', '[SEP]', 'O', 'I', 'B', 'X')

tag2idx = {tag: idx for idx, tag in enumerate(VOCAB)}
idx2tag = {idx: tag for idx, tag in enumerate(VOCAB)}

# Reserve two positions for [CLS] and [SEP].
MAX_LEN = 256 - 2

class NerDataset(Dataset):
    def __init__(self, f_path):
        with open(f_path, 'r', encoding='utf-8') as fr:
            entries = fr.read().strip().split('\n\n')
        sents, tags_li = [], []
        for entry in entries:
            words = [line.split()[0] for line in entry.splitlines()]
            tags = [line.split()[-1] for line in entry.splitlines()]
            if len(words) > MAX_LEN:
                # The entry is too long for BERT: split it into sentences at the
                # Chinese full stop '。' (and drop the stray private-use character '\ue236').
                word, tag = [], []
                for char, t in zip(words, tags):
                    if char != '。':
                        if char != '\ue236':
                            word.append(char)
                            tag.append(t)
                    else:
                        sents.append(["[CLS]"] + word[:MAX_LEN] + ["[SEP]"])
                        tags_li.append(['[CLS]'] + tag[:MAX_LEN] + ['[SEP]'])
                        word, tag = [], []

                if len(word):
                    sents.append(["[CLS]"] + word[:MAX_LEN] + ["[SEP]"])
                    tags_li.append(['[CLS]'] + tag[:MAX_LEN] + ['[SEP]'])
                    word, tag = [], []
            else:
                sents.append(["[CLS]"] + words[:MAX_LEN] + ["[SEP]"])
                tags_li.append(['[CLS]'] + tags[:MAX_LEN] + ['[SEP]'])

        self.sents, self.tags_li = sents, tags_li

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx]
        x, y = [], []
        is_heads = []
        for w, t in zip(words, tags):
            # Word-piece tokenization; [CLS] and [SEP] are passed through unchanged.
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            # Only the first piece of each character is a "head"; trailing pieces
            # get the dummy '' tag (index 0) and are ignored at evaluation time.
            is_head = [1] + [0]*(len(tokens) - 1)
            t = [t] + [''] * (len(tokens) - 1)
            yy = [tag2idx[each] for each in t]

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)
        assert len(x)==len(y)==len(is_heads), f"len(x)={len(x)}, len(y)={len(y)}, len(is_heads)={len(is_heads)}"

        seqlen = len(y)

        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

    def __len__(self):
        return len(self.sents)

def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch]
    x = f(1, maxlen)
    y = f(-2, maxlen)

    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

Data processing therefore just turns the text into the input format BERT expects, pairing every character with its corresponding label.
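As a quick sanity check, here is a minimal sketch of how the pieces in utils.py fit together; the file path is a placeholder for any file in the (character label) format above:

from torch.utils.data import DataLoader
from utils import NerDataset, pad

# Placeholder path: any file in the (character label) format described above.
dataset = NerDataset('./processed/new_train.txt')

words, x, is_heads, tags, y, seqlen = dataset[0]
print(words)    # the original characters, space-joined, wrapped in [CLS] ... [SEP]
print(seqlen)   # number of word-piece ids in x

loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=pad)
for words, x, is_heads, tags, y, seqlens in loader:
    print(x.shape, y.shape)   # both [batch_size, longest_sample_in_batch]
    break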

Model training

This stage consists of building the network and then feeding in the data for training.
The network module crf.py is given below; the whole model is simply a BERT network plus a BiLSTM plus a CRF. The inputs first go through BERT to produce contextual representations, those representations are fed into the BiLSTM to produce new hidden states, and the CRF then decodes these states into the corresponding tag sequence. (A short usage sketch follows the module.)

import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel

def argmax(vec):
    # Index of the max element (single-sample helper, kept for reference).
    _, idx = torch.max(vec, 1)
    return idx.item()

def log_sum_exp(vec):
    # Numerically stable log-sum-exp for a single sample.
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

def log_sum_exp_batch(log_Tensor, axis=-1):
    # Batched, numerically stable log-sum-exp used by the CRF forward algorithm.
    return torch.max(log_Tensor, axis)[0] + \
        torch.log(torch.exp(log_Tensor-torch.max(log_Tensor, axis)[0].view(log_Tensor.shape[0],-1,1)).sum(axis))

class Bert_BiLSTM_CRF(nn.Module):
    def __init__(self, tag_to_ix, hidden_dim=768):
        super(Bert_BiLSTM_CRF, self).__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        # Two-layer BiLSTM on top of the 768-dim BERT outputs.
        self.lstm = nn.LSTM(bidirectional=True, num_layers=2, input_size=768, hidden_size=hidden_dim//2, batch_first=True)
        # CRF transition scores: transitions[i, j] is the score of moving from tag j to tag i.
        self.transitions = nn.Parameter(torch.randn(
            self.tagset_size, self.tagset_size
        ))
        self.hidden_dim = hidden_dim
        self.start_label_id = self.tag_to_ix['[CLS]']
        self.end_label_id = self.tag_to_ix['[SEP]']
        self.fc = nn.Linear(hidden_dim, self.tagset_size)
        self.bert = BertModel.from_pretrained('./chinese_L-12_H-768_A-12')
        self.bert.eval()

        # Forbid transitions into [CLS] and out of [SEP].
        self.transitions.data[self.start_label_id, :] = -10000
        self.transitions.data[:, self.end_label_id] = -10000
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        '''
        this also called alpha-recursion or forward recursion, to calculate log_prob of all barX
        '''

        T = feats.shape[1]
        batch_size = feats.shape[0]

        # log_alpha[:, 0, s] = score of being in tag s at t=0; only [CLS] is allowed.
        log_alpha = torch.Tensor(batch_size, 1, self.tagset_size).fill_(-10000.).to(self.device)
        log_alpha[:, 0, self.start_label_id] = 0

        for t in range(1, T):
            # Alpha recursion: log-sum-exp over the previous tag, plus the emission scores at t.
            log_alpha = (log_sum_exp_batch(self.transitions + log_alpha, axis=-1) + feats[:, t]).unsqueeze(1)

        # Log of the partition function: sum over all possible tag sequences.
        log_prob_all_barX = log_sum_exp_batch(log_alpha)
        return log_prob_all_barX

    def _score_sentence(self, feats, label_ids):
        T = feats.shape[1]
        batch_size = feats.shape[0]

        batch_transitions = self.transitions.expand(batch_size,self.tagset_size,self.tagset_size)
        batch_transitions = batch_transitions.flatten(1)

        score = torch.zeros((feats.shape[0],1)).to(self.device)

        for t in range(1, T):
            score = score + \
                batch_transitions.gather(-1, (label_ids[:, t]*self.tagset_size+label_ids[:, t-1]).view(-1,1)) \
                    + feats[:, t].gather(-1, label_ids[:, t].view(-1,1)).view(-1,1)
        return score

    def _bert_enc(self, x):
        """
        x: [batch_size, sent_len]
        enc: [batch_size, sent_len, 768] (last BERT layer)
        Note: under torch.no_grad() the BERT weights receive no gradient, so BERT
        itself stays frozen here; remove the no_grad() wrapper if BERT should be
        fine-tuned along with the rest of the model.
        """
        with torch.no_grad():
            encoded_layer, _ = self.bert(x)
            enc = encoded_layer[-1]
        return enc

    def _viterbi_decode(self, feats):
        '''
        Max-Product Algorithm or viterbi algorithm, argmax(p(z_0:t|x_0:t))
        '''

        T = feats.shape[1]
        batch_size = feats.shape[0]

        log_delta = torch.Tensor(batch_size, 1, self.tagset_size).fill_(-10000.).to(self.device)
        log_delta[:, 0, self.start_label_id] = 0.

        # psi[:, t, s] remembers the best previous tag when tag s is chosen at step t.
        psi = torch.zeros((batch_size, T, self.tagset_size), dtype=torch.long)
        for t in range(1, T):
            # log_delta[:, s]: best score of any path ending in tag s at step t
            # (emission scores are added on the next line).
            log_delta, psi[:, t] = torch.max(self.transitions + log_delta, -1)
            log_delta = (log_delta + feats[:, t]).unsqueeze(1)

        # Back-trace the best path from the last position.
        path = torch.zeros((batch_size, T), dtype=torch.long)
        max_logLL_allz_allx, path[:, -1] = torch.max(log_delta.squeeze(), -1)
        for t in range(T-2, -1, -1):
            path[:, t] = psi[:, t+1].gather(-1, path[:, t+1].view(-1,1)).squeeze()

        return max_logLL_allz_allx, path

    def neg_log_likelihood(self, sentence, tags):
        # CRF loss: log Z(x) minus the score of the gold tag path, averaged over the batch.
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return torch.mean(forward_score - gold_score)

    def _get_lstm_features(self, sentence):
        """sentence is the ids"""

        embeds = self._bert_enc(sentence)

        enc, _ = self.lstm(embeds)
        lstm_feats = self.fc(enc)
        return lstm_feats

    def forward(self, sentence):

        lstm_feats = self._get_lstm_features(sentence)

        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq
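Here is a minimal usage sketch of the module above (the input sentence is made up and the weights are untrained, so the predicted tags are meaningless; tokenizer, tag2idx and idx2tag come from utils.py): the model takes a batch of BERT token ids and returns the Viterbi score together with the best tag path, which idx2tag maps back to tag strings.

import torch
from utils import tokenizer, tag2idx, idx2tag
from crf import Bert_BiLSTM_CRF

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Bert_BiLSTM_CRF(tag2idx).to(device)

# A made-up input sentence: [CLS] + characters + [SEP], converted to BERT token ids.
tokens = ['[CLS]'] + list('地震发生在昨天') + ['[SEP]']
ids = tokenizer.convert_tokens_to_ids(tokens)
x = torch.LongTensor([ids]).to(device)        # shape [1, seq_len]

score, path = model(x)                        # Viterbi decoding
pred_tags = [idx2tag[i] for i in path[0].tolist()]
print(pred_tags)                              # one tag per input token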

The training script main.py is given below:

import torch
import torch.nn as nn
import torch.optim as optim
import os
import numpy as np
import argparse
from torch.utils import data
from crf import Bert_BiLSTM_CRF
from utils import NerDataset, pad, VOCAB, tokenizer, tag2idx, idx2tag

# Select which GPU is visible; adjust (or remove) for your machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

def train(model, iterator, optimizer, criterion, device):
    model.train()
    for i, batch in enumerate(iterator):
        words, x, is_heads, tags, y, seqlens = batch
        x = x.to(device)
        y = y.to(device)
        _y = y
        optimizer.zero_grad()

        loss = model.neg_log_likelihood(x, y)

        loss.backward()
        optimizer.step()

        if i==0:
            print("=====sanity check======")

            print("x:", x.cpu().numpy()[0][:seqlens[0]])

            print("is_heads:", is_heads[0])
            print("y:", _y.cpu().numpy()[0][:seqlens[0]])
            print("tags:", tags[0])
            print("seqlen:", seqlens[0])
            print("=======================")

        if i%10==0:
            print(f"step: {i}, loss: {loss.item()}")

def eval(model, iterator, f, device):
    model.eval()

    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch
            x = x.to(device)

            _, y_hat = model(x)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(y.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    count = 0
    for sentence_idx in range(len(Y)):
        if Y[sentence_idx] == Y_hat[sentence_idx]:
            count += 1
    sentence_precision = count / len(Y)
    print("Sentence-level exact-match accuracy:", sentence_precision)

    with open("temp", 'w', encoding='utf-8') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                # One "char gold pred" triple per line, space-separated so it can be re-parsed below.
                fout.write(f"{w} {t} {p}\n")
            fout.write("\n")

    y_true =  np.array([tag2idx[line.split()[1]] for line in open("temp", 'r', encoding='utf-8').read().splitlines() if len(line) > 0])
    y_pred =  np.array([tag2idx[line.split()[2]] for line in open("temp", 'r', encoding='utf-8').read().splitlines() if len(line) > 0])

    num_proposed = len(y_pred[y_pred>1])
    num_correct = (np.logical_and(y_true==y_pred, y_true>1)).astype(int).sum()
    num_gold = len(y_true[y_true>1])

    print(f"num_proposed:{num_proposed}")
    print(f"num_correct:{num_correct}")
    print(f"num_gold:{num_gold}")
    try:
        precision = num_correct / num_proposed
    except ZeroDivisionError:
        precision = 1.0

    try:
        recall = num_correct / num_gold
    except ZeroDivisionError:
        recall = 1.0

    try:
        f1 = 2*precision*recall / (precision + recall)
    except ZeroDivisionError:
        if precision*recall==0:
            f1=1.0
        else:
            f1=0

    final = f + ".P%.2f_R%.2f_F%.2f" %(precision, recall, f1)
    with open(final, 'w', encoding='utf-8') as fout:
        result = open("temp", "r", encoding='utf-8').read()
        fout.write(f"{result}\n")

        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    os.remove("temp")

    print("precision=%.2f"%precision)
    print("recall=%.2f"%recall)
    print("f1=%.2f"%f1)
    return precision, recall, f1

def get_embbeding(model, iterator, save_path):
    '''Dump the BERT encodings of every batch in the iterator to a pickle file.'''
    import pickle
    model.eval()
    model.bert.eval()
    emb_list = []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch
            x = x.to(device)

            # [batch_size, seq_len, 768] encoding of this batch
            sentence_emb = model._bert_enc(x)
            emb_list.append(sentence_emb.cpu().numpy())

    # The target directory (e.g. ./data_emb) must already exist.
    with open(save_path, "wb") as pickle_file:
        pickle.dump(emb_list, pickle_file)
    print("Embeddings saved to", save_path)

if __name__=="__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=20)
    parser.add_argument("--lr", type=float, default=0.0001)
    parser.add_argument("--n_epochs", type=int, default=100)
    parser.add_argument("--finetuning", dest="finetuning", action="store_true")
    parser.add_argument("--top_rnns", dest="top_rnns", action="store_true")
    parser.add_argument("--logdir", type=str, default="checkpoints/01")
    parser.add_argument("--trainset", type=str, default="./processed/ace事件抽取/new_train.txt")
    parser.add_argument("--validset", type=str, default="./processed/ace事件抽取/new_eval.txt")
    parser.add_argument("--Train",type=str, default="./processed/ace事件抽取/new_train.txt")
    parser.add_argument("--Test", type=str, default="./processed/ace事件抽取/new_test.txt")
    parser.add_argument("--Dev", type=str, default="./processed/ace事件抽取/new_dev.txt")
    hp = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = Bert_BiLSTM_CRF(tag2idx).to(device)
    print('Initial model Done')

    train_dataset = NerDataset(hp.trainset)
    eval_dataset = NerDataset(hp.validset)

    emb_train_dataset=NerDataset(hp.Train)
    emb_dev_dataset = NerDataset(hp.Dev)
    emb_test_dataset = NerDataset(hp.Test)

    print('Load Data Done')

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=pad)
    eval_iter = data.DataLoader(dataset=eval_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=pad)

    emb_train_iter=data.DataLoader(dataset=emb_train_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=pad)
    emb_dev_iter = data.DataLoader(dataset=emb_dev_dataset,
                                     batch_size=hp.batch_size,
                                     shuffle=False,
                                     num_workers=4,
                                     collate_fn=pad)
    emb_test_iter = data.DataLoader(dataset=emb_test_dataset,
                                     batch_size=hp.batch_size,
                                     shuffle=False,
                                     num_workers=4,
                                     collate_fn=pad)

    optimizer = optim.Adam(model.parameters(), lr=hp.lr)
    # Note: training uses the CRF loss (model.neg_log_likelihood); this criterion
    # is passed to train() for interface compatibility but is not actually used.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    print('Start Train...,')
    for epoch in range(1, hp.n_epochs+1):

        train(model, train_iter, optimizer, criterion, device)

        print(f"=========eval at epoch={epoch}=========")
        if not os.path.exists(hp.logdir): os.makedirs(hp.logdir)
        fname = os.path.join(hp.logdir, str(epoch))
        precision, recall, f1 = eval(model, eval_iter, fname, device)

        torch.save(model.state_dict(), f"{fname}.pt")

        torch.save(model.bert, "./FINETUNED_BERT_ENCODER_PATH")
        print(f"weights were saved to {fname}.pt")

    get_embbeding(model, emb_train_iter,'./data_emb/train_emb')
    get_embbeding(model,emb_dev_iter,'./data_emb/dev_emb')
    get_embbeding(model, emb_test_iter, './data_emb/test_emb')

Obtaining word embeddings from the fine-tuned BERT

Word embeddings produced by a BERT that has been fine-tuned with this sequence-labeling setup tend to improve results on other tasks over the same text. The method for extracting them is already included in the code above: process the text you want embeddings for into the same format as the training data and feed it through the model.


def get_embbeding(model, iterator, save_path):
    '''Dump the BERT encodings of every batch in the iterator to a pickle file.'''
    import pickle
    model.eval()
    model.bert.eval()
    emb_list = []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch
            x = x.to(device)

            # [batch_size, seq_len, 768] encoding of this batch
            sentence_emb = model._bert_enc(x)
            emb_list.append(sentence_emb.cpu().numpy())

    # The target directory (e.g. ./data_emb) must already exist.
    with open(save_path, "wb") as pickle_file:
        pickle.dump(emb_list, pickle_file)
    print("Embeddings saved to", save_path)

The data is processed exactly the same way as the earlier training and test data:


    get_embbeding(model, emb_train_iter,'./data_emb/train_emb')
    get_embbeding(model,emb_dev_iter,'./data_emb/dev_emb')
    get_embbeding(model, emb_test_iter, './data_emb/test_emb')
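For downstream use, the pickled files can simply be read back with pickle. Each file is a list with one numpy array per batch, in the order produced by the (non-shuffled) loaders above; a minimal sketch, assuming the paths used above:

import pickle

# Reload the embeddings dumped by get_embbeding above.
with open('./data_emb/train_emb', 'rb') as f:
    train_emb = pickle.load(f)      # list of arrays, one per batch

print(len(train_emb))               # number of batches
print(train_emb[0].shape)           # (batch_size, seq_len, 768)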

Summary

A previous task of mine, event trigger identification, needed BERT word embeddings. I tried several BERT variants, but without fine-tuning none of them worked well, and the way my task was set up could not plug into BERT's interface directly, so I could not simply bolt BERT on. I therefore first used this model to run sequence labeling on my dataset, i.e. to tag the event triggers, and then extracted the fine-tuned word embeddings, which gave my recognition task a large boost.

One point worth repeating: when I used sequence labeling to identify triggers, the character-level tagging accuracy over the whole text was quite high, but the accuracy computed only over the predicted triggers was much worse, probably because the O tag makes up the vast majority of labels and even a few mistakes on the trigger spans drag the trigger-level score down. I have also seen a top-conference paper that frames trigger identification as sequence labeling; it represents each sentence with a mix of character-level, word-level, positional, and BERT-derived vectors, feeds them into an LSTM, and decodes with a CRF, with good results. If you need the complete project, contact me by email: 974128464@qq.com

Original: https://blog.csdn.net/weixin_44305190/article/details/120055901
Author: 西南叶孤城
Title: 使用Bert_BiLSTM_CRF进行实体或事件序列标注、获取微调后的Bert词向量
