#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @version: v1.0
# @Author : Meng Li
# @contact: 925762221@qq.com
# @FILE : Torch_bert.py
# @Time : 2022/7/7 14:32
# @Software : PyCharm
# @site:
# @Description : A from-scratch implementation of the BERT model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import re
import random
import numpy as np
import math
text = (
'Hello, how are you? I am Romeo.\n' # R
'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
'Nice meet you too. How are you today?\n' # R
'Great. My baseball team won the competition.\n' # J
'Oh Congratulations, Juliet\n' # R
'Thank you Romeo\n' # J
'Where are you going today?\n' # R
'I am going shopping. What about you?\n' # J
'I am going to visit my grandmother. she is not very well' # R
)
sentence = re.sub("[,.!?\\-]", "", text.lower()).split("\n")  # strip the characters ".,!?-" from the text
vocab = " ".join([i for i in sentence])
vocab = list(set([i for i in vocab.split(" ")]))
word2idx = {'MASK': 0, 'CLS': 1, 'SEQ': 2, 'PAD': 3}
for i in range(len(vocab)):
    word2idx[vocab[i]] = i + 4
idx2word = {i: j for i, j in enumerate(word2idx)}
vocab_size = len(idx2word)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
token_list = []
for i in range(len(sentence)):
    token_list.append([word2idx[j] for j in sentence[i].split(" ")])
max_len = 30  # maximum sequence length
num_pred = 5  # maximum number of masked tokens per sample
batch_size = 6  # batch size
n_layers = 6
embedding_size = 768  # embedding dimension
segments_len = 2
embed_size = 768
dim = 64
num_heads = 12
d_ff = 64
dropout = 0.5
class my_dataset(Dataset):
    def __init__(self, input_ids, segment_ids, masked_pos, masked_tokens, isNext):
        super().__init__()
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_pos = masked_pos
        self.masked_tokens = masked_tokens
        self.isNext = isNext

    def __getitem__(self, index):
        return self.input_ids[index], self.segment_ids[index], self.masked_pos[index], self.masked_tokens[index], \
               self.isNext[index]

    def __len__(self):
        return self.input_ids.size(0)
def make_data(seq_data):
    """
    :param seq_data: tokenised sentences
    :return: a list of [input_ids, segment_ids, masked_pos, masked_tokens, isNext] samples
    """
    batch = []
    left_cnt = right_cnt = 0
    # keep sampling until we have enough positive (isNext) and negative sentence pairs
    while left_cnt <= batch_size / 2 or right_cnt <= batch_size / 2:
        sen_a_idx = random.randrange(len(seq_data))
        sen_b_idx = random.randrange(len(seq_data))
        tokens_a = seq_data[sen_a_idx]
        tokens_b = seq_data[sen_b_idx]
        # an int wrapped in a list can be concatenated with a list this way
        input_ids = [word2idx['CLS']] + tokens_a + [word2idx['SEQ']] + tokens_b + [word2idx['SEQ']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        n_pred = min(num_pred, int(len(input_ids) * 0.15))  # mask roughly 15% of the tokens in the sequence
        # candidate positions exclude the special CLS / SEQ tokens
        test_input_ids = [i for i, j in enumerate(input_ids) if idx2word[j] != 'CLS' and idx2word[j] != 'SEQ']
        random.shuffle(test_input_ids)  # shuffle the candidate positions
        masked_tokens, masked_pos = [], []
        for word in range(n_pred):
            cand_rep_idx = test_input_ids[word]
            masked_pos.append(cand_rep_idx)
            masked_tokens.append(input_ids[cand_rep_idx])
            p = random.random()
            if p < 0.8:    # 80%: replace with the MASK token
                input_ids[cand_rep_idx] = word2idx['MASK']
            elif p < 0.9:  # 10%: replace with a random token from the sequence
                other_idx = random.randrange(len(input_ids))
                input_ids[cand_rep_idx] = input_ids[other_idx]
            else:          # 10%: keep the original token unchanged
                pass
        # pad input_ids and segment_ids up to max_len
        n_pad = max_len - len(input_ids)
        input_ids.extend(n_pad * [0])
        segment_ids.extend(n_pad * [0])
        # pad masked_pos and masked_tokens up to num_pred
        if num_pred > n_pred:
            n_pad = num_pred - n_pred
            masked_pos.extend(n_pad * [0])
            masked_tokens.extend(n_pad * [0])
        if sen_a_idx + 1 != sen_b_idx and left_cnt <= batch_size / 2:
            isNext = False
            left_cnt = left_cnt + 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
        elif sen_a_idx + 1 == sen_b_idx and right_cnt <= batch_size / 2:
            isNext = True
            right_cnt = right_cnt + 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
    return batch


class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, seq_len, seq_len]
        Scaled dot-product attention is used here: with a plain dot product the variance of the
        scores grows with the dimension, which shrinks the gradients during back-propagation.
        """
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(dim)  # scores -> [batch_size, n_heads, len_q, len_k]
        # set the entries of scores where attn_mask is True to -1e9
        scores.masked_fill_(attn_mask, -1e9)  # fills elements of the tensor with the value where the mask is True
        attn = nn.Softmax(dim=-1)(scores)  # attn -> [batch_size, n_heads, len_q, len_k]
        # multiply the masked attention weights with V to get the attended context
        context = torch.matmul(attn, V)
        return context


class Multi_Head_Attention(nn.Module):
    def __init__(self):
        super().__init__()
        self.W_Q = nn.Linear(embed_size, dim * num_heads, bias=False)  # project the input into a lower-dimensional space
        self.W_K = nn.Linear(embed_size, dim * num_heads, bias=False)
        self.W_V = nn.Linear(embed_size, dim * num_heads, bias=False)
        self.projection = torch.nn.Linear(num_heads * dim, embed_size)  # map the attention output back to the input dimension

    def forward(self, input_Q, input_K, input_V, atten_mask):
        """
        :param input_Q: -> [Batch_size, len_q, embedding_size]
        :param input_K: -> [Batch_size, len_k, embedding_size]
        :param input_V: -> [Batch_size, len_v(=len_k), embedding_size]
        :param atten_mask: -> [Batch_size, atten_len_k, atten_len_v]
        :return: here dim is the per-head dimension of the Q, K and V matrices
        # Project the input into Q, K and V, compute the attention matrix from Q and K,
        # then apply the mask to obtain the masked attention output.
        # The returned tensor has the same shape as the input enc_inputs.
        """
        torch.backends.cudnn.enabled = False
        residual = input_Q  # [Batch_size, len_q, embedding_size] residual term, added to the multi-head attention output
        _, len_q, embedding_size = input_Q.size()
        _, len_k, _ = input_K.size()
        Batch_size, atten_len_k, atten_len_v = atten_mask.size()
        # multiply the inputs by the weight matrices to obtain Q, K and V
        Q = self.W_Q(input_Q).view(Batch_size, num_heads, len_q, dim)  # Q -> [Batch_size, num_heads, len_q, dim]
        K = self.W_K(input_K).view(Batch_size, num_heads, len_k, dim)  # K -> [Batch_size, num_heads, len_k, dim]
        V = self.W_V(input_V).view(Batch_size, num_heads, len_k, dim)  # V -> [Batch_size, num_heads, len_v, dim]
        atten_mask = atten_mask.unsqueeze(1)  # atten_mask -> [Batch_size, 1, atten_len_k, atten_len_v]
        # atten_mask -> [Batch_size, num_heads, atten_len_k, atten_len_v], where atten_len_v == len_q
        atten_mask = atten_mask.repeat(1, num_heads, 1, 1)
        atten = ScaledDotProductAttention()(Q, K, V, atten_mask)
        atten = atten.transpose(1, 2)  # atten -> [Batch_size, atten_len_k, num_heads, dim]
        atten = atten.reshape(Batch_size, atten_len_k, -1)  # atten -> [Batch_size, atten_len_k, num_heads * dim]
        atten = self.projection(atten).to(device)  # atten -> [Batch_size, atten_len_k, embed_size], atten_len_k == len_q
        # softmax does not change the shape; Add & Norm: add the residual and apply LayerNorm
        atten_ret = (residual + torch.softmax(atten, dim=1))
        atten_ret = nn.LayerNorm(embed_size).to(device)(atten_ret)
        return atten_ret
class Feed_forward(nn.Module):
    """
    Corresponds to the Feed-Forward block in the original paper.
    To check whether a tensor x is stored on the GPU, run: x.is_cuda
    """

    def __init__(self):
        super().__init__()
        self.W1 = nn.Linear(embed_size, d_ff).to(device)
        self.W2 = nn.Linear(d_ff, embed_size).to(device)
        self.b1 = torch.rand(d_ff).to(device)
        self.b2 = torch.rand(embed_size).to(device)
        self.relu = nn.ReLU().to(device)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, enc_inputs):
        """
        :param enc_inputs: # enc_inputs -> [Batch_size, seq_len, embedding_size]
        :return: # output -> [Batch_size, seq_len, embedding_size]
        """
        fc1 = self.W1(enc_inputs) + self.b1
        fc1 = self.relu(fc1)
        fc2 = self.W2(fc1) + self.b2  # fc2 -> [Batch_size, seq_len, embedding_size]
        output = fc2  # output -> [Batch_size, seq_len, embedding_size]
        residual = enc_inputs
        # Add & Norm; use .to(device) instead of .cuda() so the code also runs on CPU
        Add_And_Norm = nn.LayerNorm(embed_size).to(device)(output + residual)
        return Add_And_Norm
class Encoder_layer(nn.Module):
    def __init__(self):
        super().__init__()
        self.multi_head_attention = Multi_Head_Attention()
        self.feed_forward = Feed_forward()

    def forward(self, enc_inputs, enc_atten_mask):
        """
        :param enc_inputs: # enc_inputs -> [Batch_size, src_len, embedding_size]
        :param enc_atten_mask: # enc_atten_mask -> [Batch_size, src_len, src_len]
        :return:
        """
        # self-attention: Q, K and V are all taken from the same input
        atten_output = self.multi_head_attention(enc_inputs, enc_inputs, enc_inputs, enc_atten_mask)
        output = self.feed_forward(atten_output).to(device)  # output -> [Batch_size, seq_len, embed_size]
        return output, atten_output
def get_attn_pad_mask(seq_q, seq_k):
    """
    :param seq_q: seq_q -> [Batch_size, len_q]
    :param seq_k: seq_k -> [Batch_size, len_k]
    :return:
    """
    Batch_size, len_q = seq_q.size()
    Batch_size, len_k = seq_k.size()
    atten_mask = seq_k.eq(0).unsqueeze(1)  # atten_mask -> [Batch_size, 1, len_k]
    atten_mask = atten_mask.expand(Batch_size, len_q, len_k)  # atten_mask -> [Batch_size, len_q, len_k]
    return atten_mask
def gelu(x):
    """
    Implementation of the GELU activation function.
    For information: OpenAI GPT's GELU is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    The BERT paper uses GELU instead of ReLU.
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
class BERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embed = torch.nn.Embedding(vocab_size, embedding_size).to(device)  # token embedding
        self.pos_embed = torch.nn.Embedding(max_len, embedding_size).to(device)  # learnable position embedding
        self.seg_embed = torch.nn.Embedding(segments_len, embedding_size).to(device)  # segment embedding, distinguishes the two sentences
        self.layers = nn.ModuleList(Encoder_layer() for _ in range(n_layers))
        self.fc1 = nn.Sequential(
            nn.Linear(embedding_size, embedding_size),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(embedding_size, 2)  # binary classification of the sentence pair: isNext or not
        self.fc2 = nn.Linear(embedding_size, vocab_size)
        self.linear = nn.Linear(embedding_size, embedding_size)

    def forward(self, input_token, segments_, masked_pos):
        """
        :param masked_pos: [Batch_size, n_pred]
        :param input_token: [Batch_size, seq_len]
        :param segments_: [Batch_size, seq_len]
        :return:
        """
        Batch_size, seq_len = input_token.size()
        pos = torch.arange(seq_len, dtype=torch.long)  # [seq_len]
        pos = pos.unsqueeze(0)  # [1, seq_len]
        pos = pos.repeat(Batch_size, 1).to(device)  # [Batch_size, seq_len]
        # input_token_embed -> [Batch_size, seq_len, embedding_size]
        input_token_embed = self.token_embed(input_token) + self.seg_embed(segments_) + self.pos_embed(pos)
        enc_atten_mask = get_attn_pad_mask(input_token, input_token)  # [Batch_size, seq_len, seq_len]
        output = input_token_embed
        for layer in self.layers:
            output, _ = layer(output, enc_atten_mask)  # output -> [Batch_size, seq_len, embedding_size]
        _, seq_len, _ = output.size()
        # NSP head
        nsp_output = output
        nsp_output = self.fc1(nsp_output)  # [Batch_size, seq_len, embedding_size]
        nsp_output = self.classifier(nsp_output)  # [Batch_size, seq_len, 2]
        nsp_output = torch.sum(nsp_output.transpose(2, 1), dim=-1)  # [Batch_size, 2]
        nsp_output = torch.softmax(nsp_output, dim=-1)  # [Batch_size, 2]
        # MLM head: gather the encoder outputs at the masked positions
        masked_pos = masked_pos.unsqueeze(-1).repeat(1, 1, embedding_size)  # [Batch_size, n_pred, embedding_size]
        nlm_output = torch.gather(output, 1, masked_pos)
        nlm_output = gelu(self.linear(nlm_output))  # [Batch_size, n_pred, embedding_size]
        nlm_output = self.fc2(nlm_output)  # [Batch_size, n_pred, vocab_size]
        return nsp_output, nlm_output
def train():
    batch = make_data(token_list)
    input_ids, segment_ids, masked_pos, masked_tokens, isNext = zip(*batch)
    input_ids, segment_ids, masked_pos, masked_tokens, isNext = torch.LongTensor(input_ids), torch.LongTensor(
        segment_ids), torch.LongTensor(masked_pos), torch.LongTensor(masked_tokens), torch.LongTensor(isNext)
    train_data = my_dataset(input_ids, segment_ids, masked_pos, masked_tokens, isNext)
    train_iter = DataLoader(train_data, batch_size, shuffle=True)
    crition = torch.nn.CrossEntropyLoss()  # called as crition(prediction, target)
    model = BERT().train()
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    for step in range(1000):
        for input_ids_i, segment_ids_i, masked_pos_i, masked_tokens_i, isNext_i in train_iter:
            input_ids_i, segment_ids_i, masked_pos_i, masked_tokens_i, isNext_i = \
                input_ids_i.to(device), segment_ids_i.to(device), masked_pos_i.to(device), \
                masked_tokens_i.to(device), isNext_i.to(device)
            optimizer.zero_grad()
            nsp_out, nlm_out = model(input_ids_i, segment_ids_i, masked_pos_i)
            classify_loss = crition(nsp_out, isNext_i)  # NSP loss
            masked_tokens_i = masked_tokens_i.view(-1)
            nlm_out = nlm_out.view(-1, vocab_size)
            nlm_loss = crition(nlm_out, masked_tokens_i).mean()  # MLM loss over the masked positions
            loss = nlm_loss + classify_loss
            loss.backward()
            optimizer.step()
        if step % 100 == 0:
            print("step {0} nlm_loss {1} classify_loss {2}".format(step, nlm_loss, classify_loss))


if __name__ == '__main__':
    train()
The final hurdle in this NLP series: the BERT model.
BERT uses the Encoder side of the Transformer; here the model stacks 6 Encoder layers, and each Encoder layer contains one Multi-Head Attention block with 12 heads.
The input corpus is the 9 sentences above, used here as a tiny data set.
Two sentences are drawn at random from the corpus; if the second one directly follows the first in the document, the isNext field is True.
For each sampled pair, a few of its tokens are randomly masked; tokens here are word-level, since the text is simply split on spaces.
There are three masking modes: with 80% probability the token is replaced by 'MASK', with 10% probability it is replaced by some other token, and with the remaining 10% probability it is left unchanged.
The benefit is that the model learns to predict the masked token from its context. The idea is inspired by Word2Vec's CBOW, which also predicts the current word from its surrounding words.
The remaining 10% of the selected tokens are left untouched because downstream tasks contain no 'MASK' tokens; keeping some tokens unchanged helps the model adapt to those tasks.
p = random.random()
if p < 0.8:    # 80%: replace with the MASK token
    input_ids[cand_rep_idx] = word2idx['MASK']
elif p < 0.9:  # 10%: replace with a random token from the sequence
    other_idx = random.randrange(len(input_ids))
    input_ids[cand_rep_idx] = input_ids[other_idx]
else:          # 10%: keep the original token unchanged
    pass
Each training sample is assembled from the fields input_ids, segment_ids and isNext (plus the masked positions and tokens).
The construction of input_ids was described above; segment_ids mark which of the two sentences each position belongs to, e.g.:
segment_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  (trailing zeros are padding up to max_len)
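As a concrete illustration, here is a minimal sketch of how such a pair is assembled; it simply mirrors the construction inside make_data above, with the two sentences picked arbitrarily from the corpus:
tokens_a = [word2idx[w] for w in "hello how are you i am romeo".split()]
tokens_b = [word2idx[w] for w in "thank you romeo".split()]
input_ids = [word2idx['CLS']] + tokens_a + [word2idx['SEQ']] + tokens_b + [word2idx['SEQ']]
segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
# segment_ids -> [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1] before padding to max_len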
The input_ids, segment_ids and position indices each pass through their own Embedding layer, and the three embeddings are summed into a tensor of shape [Batch_size, seq_len, embedding_size]. The position handling differs slightly from the Transformer: instead of a fixed sinusoidal encoding, a learnable Embedding matrix is used to represent the positions.
pos = torch.arange(seq_len, dtype=torch.long) # [seq_len]
pos = pos.unsqueeze(0) # [1, seq_len]
pos = pos.repeat(Batch_size, 1).to(device) # [Batch_size, seq_len]
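For contrast, here is a minimal sketch (not part of the original code) of the fixed sinusoidal encoding used by the Transformer, next to the learnable table used in the BERT class above:
pe = torch.zeros(max_len, embedding_size)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, embedding_size, 2).float() * (-math.log(10000.0) / embedding_size))
pe[:, 0::2] = torch.sin(position * div_term)  # fixed, not learned
pe[:, 1::2] = torch.cos(position * div_term)
learned_pos_embed = torch.nn.Embedding(max_len, embedding_size)  # learnable, as used in this BERT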
The tensor then passes through n_layers Encoder layers, and the output feeds two fully connected heads; that completes the network.
The model has two losses: an MLM loss and an NSP loss.
The MLM loss (nlm_loss) is computed only over the masked tokens, while the NSP loss (nsp_loss) classifies the sentence pair. Because nsp_loss is small compared to nlm_loss, nlm_loss dominates the total loss and nsp_loss decreases only slowly.
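One simple mitigation, shown here only as a hedged sketch (it is not part of the original code, and the weight value is an arbitrary assumption to be tuned), is to up-weight the NSP term when combining the two losses:
nsp_weight = 5.0  # hypothetical scaling factor for the NSP loss
loss = nlm_loss + nsp_weight * classify_loss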
Original: https://blog.csdn.net/linxizi0622/article/details/125710630
Author: linxizi0622
Title: A Pre-trained Model Based on PyTorch + BERT
Related reading
Title: pandas corr(): Using Pandas in Python
Pandas is a Python library for working with data, generating statistics, aggregating data, and more. In this article we will look at how to use the Pandas library for data selection, aggregation and statistical analysis.
Let's get started!
We will use the bank customer churn dataset for our examples. The data can be found here:
https://www.kaggle.com/sanjanavoona1043/bank-churn
First, we import the Pandas library and print the first five rows of data:
import pandas as pd

df = pd.read_csv("Bank_churn_modelling.csv")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print(df.head())
Data selection
Here we consider selecting data from a Pandas dataframe. We can use "[]" to select columns. For example, to select "CreditScore", "Gender", "Age" and "Exited":
df_select = df[['CreditScore', 'Gender', 'Age', 'Exited']]
print(df_select.head())
You can also filter the dataframe by column values. Let's filter the data to keep only customers over the age of 40:
df_age_gt_40 = df[df['Age'] > 40]
print(df_age_gt_40.head())
We can also filter for customers aged 40 or younger:
df_age_lte_40 = df[df['Age'] <= 40]
print(df_age_lte_40.head())
Or only customers who are exactly 40:
df_age_e_40 = df[df['Age'] == 40]
print(df_age_e_40.head())
We can also filter by category. For example, we can select the rows where "Geography" is France:
df_france = df[df['Geography'] == 'France']
print(df_france.head())
The same task can be done with the ".loc[]" operator:
df_france_loc = df.loc[df.Geography == 'France']
print(df_france_loc.head())
We can also filter on multiple categorical values. Let's keep only the customers from Germany and Spain:
geography_list = ['Germany', 'Spain']
df_germany_spain = df[df['Geography'].isin(geography_list)]
print(df_germany_spain.head())
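Conditions can also be combined with the "&" and "|" operators; a small sketch along the same lines (note that each condition needs its own parentheses):
df_older_de_es = df[(df['Age'] > 40) & (df['Geography'].isin(['Germany', 'Spain']))]
print(df_older_de_es.head())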
Statistics
We can also use Pandas to generate some basic statistics. For example, to compute the mean and standard deviation of the credit score:
mean_credit_score = df['CreditScore'].mean()
print('Mean credit Score: ', mean_credit_score)
std_credit_score = df['CreditScore'].std()
print('Standard Deviation in Credit Score: ', std_credit_score)
You can also look at the minimum and maximum values:
min_credit_score = df['CreditScore'].min()
print('Min credit Score: ', min_credit_score)
max_credit_score = df['CreditScore'].max()
print('Max credit Score: ', max_credit_score)
You can also compute the correlations between features and draw a heat map. Let's look at the correlations between "Age", "CreditScore", "EstimatedSalary" and "Tenure":
corr = df[['Age', 'CreditScore', 'EstimatedSalary', 'Tenure']].corr()
print(corr)
The correlation values can be plotted as a heat map with seaborn:
import seaborn as sns

sns.heatmap(corr)
Data aggregation
We can also use Pandas to aggregate data. Specifically, the groupby method generates category-level statistics. For example, we can build a new dataframe from the original data that contains the mean "CreditScore" for each "Geography" value, which lets us analyse and compare the average credit score in France, Spain and Germany:
df_groupby_mean = df.groupby('Geography')['CreditScore'].mean()
print(df_groupby_mean.head())
You can also look at the standard deviation of the credit score:
df_groupby_std = df.groupby('Geography')['CreditScore'].std()
print(df_groupby_std.head())
Let's look at another numeric column and compute the mean age and the standard deviation of age for each country:
df_groupby_age_mean = df.groupby('Geography')['Age'].mean()
print(df_groupby_age_mean.head())
df_groupby_age_std = df.groupby('Geography')['Age'].std()
print(df_groupby_age_std.head())
We can also group by multiple columns:
df_groupby_multiple_category = df.groupby(['Geography', 'Gender'])['Age'].mean()
print(df_groupby_multiple_category.head())
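Several statistics can also be computed in one call with ".agg()"; a minimal sketch in the same spirit:
df_groupby_agg = df.groupby('Geography')['CreditScore'].agg(['mean', 'std', 'min', 'max'])
print(df_groupby_agg.head())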
Conclusion
This article showed how to use the Pandas library for tasks such as data selection, data aggregation and statistical analysis.
Original: https://blog.csdn.net/weixin_34738099/article/details/113538112
Author: 宫乘风
Title: pandas corr(): Using Pandas in Python