NLP: A HuggingFace Named Entity Recognition (NER) Example
HuggingFace on GitHub: https://github.com/huggingface/
NER annotation tool: https://github.com/doccano/doccano
Table of Contents
- NLP: A HuggingFace Named Entity Recognition (NER) Example
- Preface
- 1. Training Data
- 2. Reading the Data
- 3. Loading the BERT Tokenizer
- 4. Label Alignment
- 5. Building the Datasets
- 6. Loading a Pretrained BERT Model for Fine-Tuning
- 7. Custom Evaluation Metrics
- 8. Training
- 9. Demo
- Summary
Preface
HuggingFace is a very popular framework for natural language processing that covers a wide range of NLP tasks. This post applies it to named entity recognition (NER), a task that is itself a building block of many others. NER used to be tackled mostly with a bidirectional LSTM plus a CRF; here we fine-tune BERT directly.
1. Training Data
2 B-year
0 I-year
1 I-year
9 I-year
年 I-year
成 O
人 O
高 B-exam
考 I-exam
招 O
生 O
统 O
一 O
考 B-exam
试 I-exam
时 O
间 O
表 O
The data looks like the sample above. B-year marks the first character of a year entity and I-year marks each following character of it (in BIO tagging, I- means "inside" the entity, not just its last character); B-exam and I-exam do the same for exam names, and O marks everything else.
PS: there are many annotation tools; I use the open-source doccano, whose link is shared at the top of this post.
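For reference, here is a minimal sketch (not from the original post) of turning a doccano JSONL export into the two-column format above. It assumes each exported line looks like {"text": "...", "label": [[start, end, "year"], ...]}; the field names vary between doccano versions, so adjust accordingly.

import json

def doccano_to_bio(jsonl_path, out_path):
    # Assumed doccano JSONL export: {"text": ..., "label": [[start, end, tag], ...]}
    with open(jsonl_path, encoding='utf-8') as f_in, \
         open(out_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            record = json.loads(line)
            text = record['text']
            tags = ['O'] * len(text)
            for start, end, tag in record['label']:
                tags[start] = f'B-{tag}'
                for i in range(start + 1, end):
                    tags[i] = f'I-{tag}'
            for char, bio in zip(text, tags):
                f_out.write(f'{char} {bio}\n')
            f_out.write('\n')  # blank line between documents, as read_data below expects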
2. Reading the Data
Code (example):
# Imports needed for this snippet (omitted in the original).
import re
from pathlib import Path

# A helper to read the two-column data; set data_dir to your own path.
data_dir = ''

def read_data(file_path):
    file_path = Path(file_path)
    raw_text = file_path.read_text(encoding='UTF-8').strip()
    # Documents are separated by blank lines.
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split(' ')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)
    return token_docs, tag_docs

train_texts, train_tags = read_data(data_dir + '/train.txt')
val_texts, val_tags = read_data(data_dir + '/val.txt')
test_texts, test_tags = read_data(data_dir + '/test.txt')

# unique_tags is the set of distinct labels; tag2id maps each label to an id
# and id2tag maps each id back to its label. Both are needed later.
unique_tags = set(tag for doc in train_tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}
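One caveat worth flagging (my note, not in the original post): iterating over a Python set is not deterministic across interpreter runs, so tag2id can come out differently each time the script is executed. Sorting first gives a stable mapping, which matters when you later reload a saved checkpoint:

# Stable variant: sort the tags so the ids are reproducible across runs.
unique_tags = sorted(set(tag for doc in train_tags for tag in doc))
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}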
3. Loading the BERT Tokenizer
Code (example):
from transformers import BertTokenizerFast

# is_split_into_words=True tells the tokenizer the input is already split
# into tokens (here, one Chinese character per token).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True,
                            padding=True, truncation=True, max_length=512)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True,
                          padding=True, truncation=True, max_length=512)
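To see what return_offsets_mapping produces (a small illustration of my own), run the tokenizer on a short pre-split input: special tokens such as [CLS] and [SEP] get the offset (0, 0), while every real token gets its character span within its word. The label-alignment step below relies on exactly this distinction.

demo = tokenizer(['2', '0', '1', '9', '年'], is_split_into_words=True, return_offsets_mapping=True)
print(demo.tokens())           # e.g. ['[CLS]', '2', '0', '1', '9', '年', '[SEP]']
print(demo['offset_mapping'])  # e.g. [(0, 0), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 0)]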
4. Label Alignment
Because the tokenizer adds [CLS]/[SEP] and padding, the labels must be realigned with the tokens; the extra positions are filled with -100, which the loss function ignores. Code below:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # Start with every position set to -100.
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # Guard against documents longer than the 512-token limit
        # (512 minus [CLS] and [SEP] leaves 510 label slots).
        if len(doc_labels) >= 510:
            doc_labels = doc_labels[:510]
        # Assign labels to tokens whose first offset is 0 and second is not 0,
        # i.e. the first sub-token of each original word.
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
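A quick way to convince yourself the alignment worked (my addition, not in the original post): the label at the [CLS] position should be -100 and real tokens should carry tag ids.

print(train_encodings.tokens(0)[:8])  # tokens of the first document
print(train_labels[0][:8])            # -100 for [CLS], tag ids for real tokens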
5. Building the Datasets
import torch

class NerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping")  # not needed for training
val_encodings.pop("offset_mapping")
train_dataset = NerDataset(train_encodings, train_labels)
val_dataset = NerDataset(val_encodings, val_labels)
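As a sanity check (my addition), a single item should contain input_ids, token_type_ids, attention_mask and labels, all of the same length:

sample = train_dataset[0]
print({key: value.shape for key, value in sample.items()})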
6. Loading a Pretrained BERT Model for Fine-Tuning
The pretrained weights are downloaded automatically the first time this runs.
Set num_labels to match your own task; here it is 5 (B-year, I-year, B-exam, I-exam and O).
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    'ckiplab/albert-base-chinese-ner',
    num_labels=5,
    ignore_mismatched_sizes=True,  # the checkpoint's NER head has a different label count
    id2label=id2tag,
    label2id=tag2id,
)
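Note that the tokenizer above is bert-base-chinese while this checkpoint is an ALBERT NER model, which is why ignore_mismatched_sizes is needed. If you prefer the backbone to match the tokenizer, this variant (my suggestion, not from the original post) also works; the token-classification head is freshly initialized either way:

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-chinese',          # same checkpoint as the tokenizer
    num_labels=len(unique_tags),  # derive the label count instead of hard-coding 5
    id2label=id2tag,
    label2id=tag2id,
)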
7. Custom Evaluation Metrics
from datasets import load_metric

metric = load_metric("seqeval")
# label_list maps ids back to tag names (ids were assigned by enumerating unique_tags).
label_list = [id2tag[i] for i in range(len(unique_tags))]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Drop the positions labelled -100 (special tokens and padding).
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
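To illustrate the input format seqeval expects (a small example of my own): it takes lists of tag-name sequences rather than ids, and returns the overall_* keys used above.

example_preds = [['B-year', 'I-year', 'O']]
example_refs = [['B-year', 'I-year', 'O']]
print(metric.compute(predictions=example_preds, references=example_refs))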
8. Training
checkpoint = 'bert-base-chinese'
num_train_epochs = 1000
per_device_train_batch_size = 8
per_device_eval_batch_size = 8

training_args = TrainingArguments(
    output_dir='./output',                  # output path
    num_train_epochs=num_train_epochs,      # number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,  # batch size per GPU
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_steps=500,                       # learning-rate warmup steps
    weight_decay=0.01,                      # weight-decay regularization
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='steps',
    save_steps=1000,
    save_total_limit=1,
    evaluation_strategy='steps',
    eval_steps=1000,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()
model.save_pretrained("./checkpoint/model/%s-%sepoch" % (checkpoint, num_train_epochs))
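Once a checkpoint exists, a quick way to try the model (a sketch of mine, not from the original post) is the transformers pipeline; aggregation_strategy='simple' merges B-/I- pieces into entity spans, which is essentially what the manual demo in the next section does by hand.

from transformers import pipeline

ner = pipeline(
    'token-classification',
    model='./output/checkpoint-2000',  # pick whichever checkpoint the Trainer saved
    tokenizer=tokenizer,
    aggregation_strategy='simple',
)
print(ner('2009年高考在北京的报名费是2009元'))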
9. Demo
Here is a small demo with input_str = '2009年高考在北京的报名费是2009元'.
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, BertTokenizerFast

def get_token(input):
    # Keep runs of Latin letters together; split everything else character by character.
    english = 'abcdefghijklmnopqrstuvwxyz'
    output = []
    buffer = ''
    for s in input:
        if s in english or s in english.upper():
            buffer += s
        else:
            if buffer:
                output.append(buffer)
            buffer = ''
            output.append(s)
    if buffer:
        output.append(buffer)
    return output
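For example (my addition), get_token keeps consecutive Latin letters together and splits everything else per character, so English words survive as single units:

print(get_token('abc2009年NLP'))  # ['abc', '2', '0', '0', '9', '年', 'NLP']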
model = AutoModelForTokenClassification.from_pretrained('./output/checkpoint-2000')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
if __name__ == '__main__':
    input_str = '2009年高考在北京的报名费是2009元'
    input_char = get_token(input_str)
    input_tensor = tokenizer(input_char, is_split_into_words=True, padding=True, truncation=True,
                             return_offsets_mapping=True, max_length=512, return_tensors="pt")
    input_tokens = input_tensor.tokens()
    offsets = input_tensor["offset_mapping"]
    # Special tokens ([CLS], [SEP], padding) have the offset (0, 0).
    ignore_mask = offsets[0, :, 1] == 0
    input_tensor.pop("offset_mapping")  # the model's forward() does not accept this key
    outputs = model(**input_tensor)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
    predictions = outputs.logits.argmax(dim=-1)[0].tolist()
    print(predictions)
    results = []
    idx = 0
    while idx < len(predictions):
        if ignore_mask[idx]:
            idx += 1
            continue
        pred = predictions[idx]
        label = model.config.id2label[pred]
        if label != "O":
            # Strip the B- or I- prefix.
            label = label[2:]
            start = idx
            end = start + 1
            # Collect every following token predicted as I-<label>.
            all_scores = [probabilities[start][predictions[start]]]
            while (
                end < len(predictions)
                and model.config.id2label[predictions[end]] == f"I-{label}"
            ):
                all_scores.append(probabilities[end][predictions[end]])
                end += 1
                idx += 1
            # The entity score is the mean over its tokens.
            score = np.mean(all_scores).item()
            word = "".join(input_tokens[start:end])
            results.append(
                {
                    "entity_group": label,
                    "score": score,
                    "word": word,
                    "start": start,
                    "end": end,
                }
            )
        idx += 1
    for result in results:
        print(result)
Summary
HuggingFace makes tasks like this remarkably simple to set up. This post walked through a BERT-based NER example; future posts will cover GPT and other HuggingFace examples, so stay tuned if you are interested.
Original: https://blog.csdn.net/weixin_53280379/article/details/125355146
Author: 陈万君Allen