A Pre-training Model Based on PyTorch + BERT

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @version: v1.0
# @Author   : Meng Li
# @contact: 925762221@qq.com
# @FILE     : Torch_bert.py
# @Time     : 2022/7/7 14:32
# @Software : PyCharm
# @site:
# @Description : A from-scratch implementation of the BERT model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import re
import random
import numpy as np
import math

text = (
    'Hello, how are you? I am Romeo.\n'  # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'  # J
    'Nice meet you too. How are you today?\n'  # R
    'Great. My baseball team won the competition.\n'  # J
    'Oh Congratulations, Juliet\n'  # R
    'Thank you Romeo\n'  # J
    'Where are you going today?\n'  # R
    'I am going shopping. What about you?\n'  # J
    'I am going to visit my grandmother. she is not very well'  # R
)
sentence = re.sub("[,.!?\\-]", "", text.lower()).split("\n")  # strip ".,!?-" from the text
vocab = " ".join([i for i in sentence])
vocab = list(set([i for i in vocab.split(" ")]))
word2idx = {'PAD': 0, 'CLS': 1, 'SEQ': 2, 'MASK': 3}  # 'PAD' must map to 0: padding positions are filled with 0 below
for i in range(len(vocab)):
    word2idx[vocab[i]] = i + 4
idx2word = {i: j for i, j in enumerate(word2idx)}
vocab_size = len(idx2word)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

token_list = []
for i in range(len(sentence)):
    token_list.append([word2idx[j] for j in sentence[i].split(" ")])

max_len = 30  # maximum sequence length
num_pred = 5  # maximum number of masked tokens per sample
batch_size = 6  # batch size
n_layers = 6
embedding_size = 768  # embedding dimension
segments_len = 2
embed_size = 768
dim = 64
num_heads = 12
d_ff = 64
dropout = 0.5

class my_dataset(Dataset):
    def __init__(self, input_ids, segment_ids, masked_pos, masked_tokens, isNext):
        super().__init__()
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_pos = masked_pos
        self.masked_tokens = masked_tokens
        self.isNext = isNext

    def __getitem__(self, index):
        return self.input_ids[index], self.segment_ids[index], self.masked_pos[index], self.masked_tokens[index], \
               self.isNext[index]

    def __len__(self):
        return self.input_ids.size(0)

def make_data(seq_data):
"""
    :param seq_data:
    :return: 返回 [input_ids, segment_ids, masked_tokens, masked_pos, isNext]
"""
    batch = []
    left_cnt = right_cnt = 0
    while left_cnt <= batch_size / 2 or right_cnt <= batch_size / 2:
        sen_a_idx = random.randrange(len(seq_data))
        sen_b_idx = random.randrange(len(seq_data))
        tokens_a = seq_data[sen_a_idx]
        tokens_b = seq_data[sen_b_idx]
        # an int wrapped in a list can be concatenated with another list this way
        input_ids = [word2idx['CLS']] + tokens_a + [word2idx['SEQ']] + tokens_b + [word2idx['SEQ']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        n_pred = min(num_pred, int(len(input_ids) * 0.15))  # mask roughly 15% of the tokens in the sample
        # candidate positions for masking: every token except 'CLS' and 'SEQ'
        test_input_ids = [i for i, j in enumerate(input_ids) if idx2word[j] != 'CLS' and idx2word[j] != 'SEQ']
        random.shuffle(test_input_ids)  # shuffle the candidate positions
        masked_tokens, masked_pos = [], []
        for word in range(n_pred):
            cand_rep_idx = test_input_ids[word]
            masked_pos.append(cand_rep_idx)
            masked_tokens.append(input_ids[cand_rep_idx])
            p = random.random()
            if p < 0.8:  # 80%: replace the token with 'MASK'
                input_ids[cand_rep_idx] = word2idx['MASK']
            elif p < 0.9:  # 10%: replace it with a token copied from another position
                other_idx = random.randrange(len(input_ids))
                input_ids[cand_rep_idx] = input_ids[other_idx]
            else:  # 10%: keep the original token unchanged
                pass

        n_pad = max_len - len(input_ids)
        input_ids.extend(n_pad * [0])
        segment_ids.extend(n_pad * [0])

        if num_pred > n_pred:
            n_pad = num_pred - n_pred
            masked_pos.extend(n_pad * [0])
            masked_tokens.extend(n_pad * [0])

        if sen_a_idx + 1 != sen_b_idx and left_cnt <= batch_size / 2:
            isNext = False
            left_cnt = left_cnt + 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
        elif sen_a_idx + 1 == sen_b_idx and right_cnt <= batch_size / 2:
            isNext = True
            right_cnt = right_cnt + 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
    return batch

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """
        Q: [Batch_size, num_heads, len_q, dim]
        K: [Batch_size, num_heads, len_k, dim]
        V: [Batch_size, num_heads, len_v(=len_k), dim]
        attn_mask: [Batch_size, num_heads, seq_len, seq_len]
        Scaled dot-product attention is used here: with a plain dot product the variance of the scores
        grows large and the gradients shrink during back-propagation, so the scores are divided by sqrt(dim).
        """
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(dim)  # scores -> [Batch_size, num_heads, len_q, len_k]
        # set the elements of scores to -1e9 wherever attn_mask is True
        scores.masked_fill_(attn_mask, -1e9)  # Fills elements of self tensor with value where mask is True.
        attn = nn.Softmax(dim=-1)(scores)  # attn -> [Batch_size, num_heads, len_q, len_k]
        # multiply the masked attention weights with V to obtain the attention output
        context = torch.matmul(attn, V)  # context -> [Batch_size, num_heads, len_q, dim]
        return context

class Multi_Head_Attention(nn.Module):
    def __init__(self):
        super().__init__()
        self.W_Q = nn.Linear(embed_size, dim * num_heads, bias=False)  # project the input to a lower dimension
        self.W_K = nn.Linear(embed_size, dim * num_heads, bias=False)
        self.W_V = nn.Linear(embed_size, dim * num_heads, bias=False)
        self.projection = torch.nn.Linear(num_heads * dim, embed_size)  # map the attention output back to the input dimension

    def forward(self, input_Q, input_K, input_V, atten_mask):
        """
        :param input_Q: -> [Batch_size, len_q, embedding_size]
        :param input_K: -> [Batch_size, len_k, embedding_size]
        :param input_V: -> [Batch_size, len_v(=len_k), embedding_size]
        :param atten_mask:  -> [Batch_size, atten_len_k, atten_len_v]
        :return: dim here is the per-head dimension of the Q, K and V matrices
        # compute the Q, K and V matrices from the inputs, build the attention scores from Q and K,
        # then apply the mask to obtain the masked attention output
        # the returned tensor has the same shape as the input
        """
        torch.backends.cudnn.enabled = False
        residual = input_Q  # [Batch_size, len_q, embedding_size]  residual term; the multi-head attention output is added to it
        _, len_q, embedding_size = input_Q.size()
        _, len_k, _ = input_K.size()
        Batch_size, atten_len_k, atten_len_v = atten_mask.size()
        # multiply the input by the weight matrices to get Q, K and V, then split them into heads
        Q = self.W_Q(input_Q).view(Batch_size, len_q, num_heads, dim).transpose(1, 2)  # Q -> [Batch_size, num_heads, len_q, dim]
        K = self.W_K(input_K).view(Batch_size, len_k, num_heads, dim).transpose(1, 2)  # K -> [Batch_size, num_heads, len_k, dim]
        V = self.W_V(input_V).view(Batch_size, len_k, num_heads, dim).transpose(1, 2)  # V -> [Batch_size, num_heads, len_k, dim]

        atten_mask = atten_mask.unsqueeze(1)  # atten_mask -> [Batch_size, 1, atten_len_k, atten_len_v]
        # atten_mask -> [Batch_size, num_heads, atten_len_k, atten_len_v]  here atten_len_v == len_q
        atten_mask = atten_mask.repeat(1, num_heads, 1, 1)
        atten = ScaledDotProductAttention()(Q, K, V, atten_mask)
        atten = atten.transpose(1, 2)  # atten -> [Batch_size, atten_len_k, num_heads, dim]

        atten = atten.reshape(Batch_size, atten_len_k, -1)  # atten -> [Batch_size, atten_len_k, num_heads * dim]
        atten = self.projection(atten).to(device)  # atten -> [Batch_size, atten_len_k, embed_size] atten_len_k == len_q
        # softmax does not change the tensor shape; each row vector is normalized, then output and residual go through the Add && Norm step
        atten_ret = (residual + torch.softmax(atten, dim=1))
        atten_ret = nn.LayerNorm(embed_size).to(device)(atten_ret)
        return atten_ret

class Feed_forward(nn.Module):
    """
    Corresponds to the Feed-Forward block in the original paper.
    To check whether a tensor x is stored on the GPU, run: x.is_cuda
    """

    def __init__(self):
        super().__init__()
        self.W1 = nn.Linear(embed_size, d_ff).to(device)
        self.W2 = nn.Linear(d_ff, embed_size).to(device)
        self.b1 = torch.rand(d_ff).to(device)
        self.b2 = torch.rand(embed_size).to(device)
        self.relu = nn.ReLU().to(device)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, enc_inputs):
"""
        :param enc_inputs: # enc_inputs -> [Batch_size, seq_len, embedding_size]
        # atten -> [Batch_size, seq_len, embedding_size]
        :return:
"""
        fc1 = self.W1(enc_inputs) + self.b1
        fc1 = self.relu(fc1)
        fc2 = self.W2(fc1) + self.b2  # fc2 -> [Batch_size, seq_len, embedding_size]
        output = fc2  # output -> [Batch_size, seq_len, embedding_size]
        residual = enc_inputs
        Add_And_Norm = nn.LayerNorm(embed_size).to(device)(output + residual)
        return Add_And_Norm

class Encoder_layer(nn.Module):
    def __init__(self):
        super().__init__()
        self.multi_head_attention = Multi_Head_Attention()
        self.feed_forward = Feed_forward()

    def forward(self, enc_inputs, enc_atten_mask):
"""
        :param enc_inputs:  # enc_inputs -> [Batch_size, src_len, embedding_size]
        :param enc_atten_mask:   # enc_atten_mask -> [Batch_size, src_len, src_len]
        :return:
"""
        # the Q, K and V fed into the multi-head attention are all assumed to be the same tensor
        atten_output = self.multi_head_attention(enc_inputs, enc_inputs, enc_inputs, enc_atten_mask)  # attention output after Add & Norm
        output = self.feed_forward(atten_output).to(device)  # output -> [Batch_size, seq_len, embeded_size]
        return output, atten_output

def get_attn_pad_mask(seq_q, seq_k):
"""
    :param seq_q:  seq_q -> [Batch_size, len_q]
    :param seq_k:  seq_k -> [Batch_size, len_k]
    :return:
"""
    Batch_size, len_q = seq_q.size()
    Batch_size, len_k = seq_k.size()
    atten_mask = seq_k.eq(0).unsqueeze(1)  # atten_mask -> [Batch_size, 1, len_k]
    atten_mask = atten_mask.expand(Batch_size, len_q, len_k)  # atten_mask -> [Batch_size, len_q, len_k]
    return atten_mask

def gelu(x):
"""
      Implementation of the gelu activation function.

      For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
      0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
      Also see https://arxiv.org/abs/1606.08415
      &#x8BBA;&#x6587;&#x4E2D;&#x7528;GELU&#x4EE3;&#x66FF;&#x4E86;RELU
"""
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

class BERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embed = torch.nn.Embedding(vocab_size, embedding_size).to(device)  # token embedding
        self.pos_embed = torch.nn.Embedding(max_len, embedding_size).to(device)  # position embedding (learnable)
        self.seg_embed = torch.nn.Embedding(segments_len, embedding_size).to(device)  # segment (sentence A/B) embedding
        self.layers = nn.ModuleList(Encoder_layer() for _ in range(n_layers))
        self.fc1 = nn.Sequential(
            nn.Linear(embedding_size, embedding_size),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(embedding_size, 2)  # classify the sentence pair: isNext or not isNext
        self.fc2 = nn.Linear(embedding_size, vocab_size)
        self.linear = nn.Linear(embedding_size, embedding_size)

    def forward(self, input_token, segments_, masked_pos):
"""
        :param masked_pos: [Batch_size, n_pred]
        :param input_token: [Batch_size, seq_len]
        :param segments_:  [Batch_size, seq_len]
        :return:
"""
        Batch_size, seq_len = input_token.size()
        pos = torch.arange(seq_len, dtype=torch.long)  # [seq_len]
        pos = pos.unsqueeze(0)  # [1, seq_len]
        pos = pos.repeat(Batch_size, 1).to(device)  # [Batch_size, seq_len]
        # input_token_embed -> [Batch_size, seq_len, embedding_size]
        input_token_embed = self.token_embed(input_token) + self.seg_embed(segments_) + self.pos_embed(pos)
        enc_atten_mask = get_attn_pad_mask(input_token, input_token)  # [Batch_size, seq_len, seq_len]
        output = input_token_embed
        for layer in self.layers:
            output, _ = layer(output, enc_atten_mask)  # output [Batch_size, seq_len, embedding_size]
        _, seq_len, _ = output.size()
        nsp_output = output
        nsp_output = self.fc1(nsp_output)  # [Batch_size, seq_len, embedding_size]
        nsp_output = self.classifier(nsp_output)  # [Batch_size, seq_len, 2]
        nsp_output = torch.sum(nsp_output.transpose(2, 1), dim=-1)  # [Batch_size, 2]
        nsp_output = torch.softmax(nsp_output, dim=-1)  # [Batch_size, 2]

        masked_pos = masked_pos.unsqueeze(-1).repeat(1, 1, embedding_size)  # [Batch_size, n_pred, embedding_size]
        nlm_output = torch.gather(output, 1, masked_pos)
        nlm_output = gelu(self.linear(nlm_output))  # [Batch_size, n_pred, embedding_size]
        nlm_output = self.fc2(nlm_output)  # [Batch_size, n_pred, vocab_size]
        return nsp_output, nlm_output

def train():
    batch = make_data(token_list)
    input_ids, segment_ids, masked_pos, masked_tokens, isNext = zip(*batch)
    input_ids, segment_ids, masked_pos, masked_tokens, isNext = torch.LongTensor(input_ids), torch.LongTensor(
        segment_ids), torch.LongTensor(masked_pos), torch.LongTensor(masked_tokens), torch.LongTensor(isNext)
    train_data = my_dataset(input_ids, segment_ids, masked_pos, masked_tokens, isNext)
    train_iter = DataLoader(train_data, batch_size, shuffle=True)
    crition = torch.nn.CrossEntropyLoss()  # the argument order when calling this is (prediction, target)
    model = BERT().train()
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    for step in range(1000):
        for input_ids_i, segment_ids_i, masked_pos_i, masked_tokens_i, isNext_i in train_iter:
            input_ids_i, segment_ids_i, masked_pos_i, masked_tokens_i, isNext_i = input_ids_i.to(device), \
                                                                                  segment_ids_i.to(device), \
                                                                                  masked_pos_i.to(device), \
                                                                                  masked_tokens_i.to(device), \
                                                                                  isNext_i.to(device)
            optimizer.zero_grad()
            nsp_out, nlm_out = model(input_ids_i, segment_ids_i, masked_pos_i)
            classify_loss = crition(nsp_out, isNext_i)
            masked_tokens_i = masked_tokens_i.view(-1)
            nlm_out = nlm_out.view(-1, vocab_size)
            nlm_loss = crition(nlm_out, masked_tokens_i)
            nlm_loss = nlm_loss.mean()
            loss = nlm_loss + classify_loss
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print("step {0} loss {1} loss {2}".format(step, nlm_loss, classify_loss))

if __name__ == '__main__':
    train()

The last hurdle in this NLP series: the BERT model.

BERT uses the Encoder from the Transformer model; here there are 6 Encoder layers, and each Encoder layer contains a multi-head attention (Multi-Head Attention) block with 12 heads.

The input corpus consists of 9 sentences in order. I use these nine sentences as a small dataset here.

Two sentences are picked at random from the corpus; if the two sentences appear one right after the other in the document, the isNext field is True (a minimal sketch of this sampling follows).
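
Below is a minimal sketch of the pair-sampling idea, assuming the token_list built above; sample_sentence_pair, sen_a_idx and is_next are illustrative names, and the full logic (adding the CLS/SEQ tokens and masking) lives in make_data.

import random

def sample_sentence_pair(token_list):
    sen_a_idx = random.randrange(len(token_list))
    sen_b_idx = random.randrange(len(token_list))
    is_next = (sen_a_idx + 1 == sen_b_idx)  # True only when sentence B directly follows sentence A
    return token_list[sen_a_idx], token_list[sen_b_idx], is_next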

For each of the two selected sentences, a few of its tokens are masked at random; here a token is simply a single word from the whitespace-split sentence.

Masking is done in one of three ways: with 80% probability the token is replaced with '[MASK]', with 10% probability it is replaced with some other token, and with the remaining 10% probability it is left unchanged.

The benefit of this is that the trained model learns to predict the masked token from its context; the idea is inspired by Word2Vec's CBOW, which likewise predicts the current word from the words around it.

As for leaving the remaining 10% of the tokens untouched: downstream tasks have no '[MASK]' tokens in their inputs, so this is done so that the model can adapt to downstream tasks.

            if p < 0.8:  # 80%: replace the token with 'MASK'
                input_ids[cand_rep_idx] = word2idx['MASK']
            elif p < 0.9:  # 10%: replace it with a token copied from another position
                other_idx = random.randrange(len(input_ids))
                input_ids[cand_rep_idx] = input_ids[other_idx]
            else:  # 10%: keep the original token unchanged
                pass

Each sample in the training set is built from three parts: input_ids, segment_ids and isNext.

How input_ids are constructed was described above; segment_ids mark the boundary between the two sentences so the model can tell them apart.

e.g. segment_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] (a small sketch of how these fields are assembled follows).
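
For concreteness, here is a small sketch of how input_ids and segment_ids are put together for one sentence pair, following the construction in make_data; the two sentences are taken from the toy corpus above, and the final padding step (filling with 0 up to max_len) is the same as in make_data.

tokens_a = [word2idx[w] for w in "hello how are you i am romeo".split()]
tokens_b = [word2idx[w] for w in "thank you romeo".split()]
input_ids = [word2idx['CLS']] + tokens_a + [word2idx['SEQ']] + tokens_b + [word2idx['SEQ']]
segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
# segment_ids is 0 for 'CLS' + sentence A + 'SEQ', and 1 for sentence B + the final 'SEQ'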

The input_ids, segment_ids and position indices each pass through an Embedding layer and are summed into a tensor of size [Batch_size, seq_len, embedding_size]. The handling of position in this embedding layer differs a little from the Transformer: a learnable Embedding matrix is used to represent the positions (see the sketch after the code below).

pos = torch.arange(seq_len, dtype=torch.long)  # [seq_len]
pos = pos.unsqueeze(0)  # [1, seq_len]
pos = pos.repeat(Batch_size, 1).to(device)  # [Batch_size, seq_len]
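
As a short sketch of that difference: the position table here is an ordinary, learnable nn.Embedding rather than the fixed sinusoidal encoding of the original Transformer, and the three embeddings are simply summed. token_embed, seg_embed and pos_embed below stand in for the layers created in BERT.__init__, and input_token, segments_ and pos are the tensors from BERT.forward.

token_embed = torch.nn.Embedding(vocab_size, embedding_size)
seg_embed = torch.nn.Embedding(segments_len, embedding_size)
pos_embed = torch.nn.Embedding(max_len, embedding_size)  # learnable position table, one row per position
input_token_embed = token_embed(input_token) + seg_embed(segments_) + pos_embed(pos)
# input_token_embed -> [Batch_size, seq_len, embedding_size]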

The tensor then passes through n_layers Encoder layers, and the output is fed into two fully connected heads; that completes the network design.

The model has two losses: the MLM loss and the NSP loss.

The MLM loss (nlm_loss) is computed only on the masked tokens, while the NSP loss (nsp_loss) is the classification loss on the sentence-pair representation. Because nsp_loss is so small, nlm_loss dominates the total loss and nsp_loss does not decrease very noticeably; one possible mitigation is sketched below.
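
One simple workaround, not part of the original training loop, is to give the NSP term a larger weight when the two losses are summed; nsp_weight below is a hypothetical knob to tune.

nsp_weight = 10.0  # hypothetical weighting factor, not in the original code
loss = nlm_loss + nsp_weight * classify_loss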

Original: https://blog.csdn.net/linxizi0622/article/details/125710630
Author: linxizi0622
Title: 基于Pytorch+Bert的预训练模型



Related reading

Title: pandas中的corr()_在Python中使用Pandas

Pandas is a Python library for processing data, generating statistics, aggregating data, and much more. In this article we will discuss how to use the Pandas library for data selection, aggregation, and statistical analysis.

Let's get started!

We will use the bank customer churn dataset for modeling. The data can be found here:

https://www.kaggle.com/sanjanavoona1043/bank-churn

First, we import the Pandas library and print the first five rows of data:

import pandas as pd
df = pd.read_csv("Bank_churn_modelling.csv")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print(df.head())

Data selection

Here we look at data selection with Pandas DataFrames. We can use "[]" to select columns of data. For example, to select "CreditScore", "Gender", "Age" and "Exited", we can do the following:

df_select = df[['CreditScore', 'Gender', 'Age', 'Exited']]
print(df_select.head())

We can also filter the original DataFrame by column values. Let's filter the data to include only customers over the age of 40:

df_age_gt_40 = df[df['Age'] > 40]
print(df_age_gt_40.head())

We can also filter for customers aged 40 or younger:

df_age_lte_40 = df[df['Age'] <= 40]
print(df_age_lte_40.head())

Or only customers who are exactly 40 years old:

df_age_e_40 = df[df['Age'] == 40]
print(df_age_e_40.head())

We can also filter by category. For example, we can select the rows where "Geography" is France:

df_france = df[df['Geography'] == 'France']
print(df_france.head())

The same task can be done with the ".loc[]" accessor:

df_france_loc = df.loc[df.Geography == 'France']
print(df_france_loc.head())

We can also filter on multiple categorical values. Let's consider only customers from Germany and Spain:

geography_list = ['Germany', 'Spain']
df_germany_spain = df[df['Geography'].isin(geography_list)]
print(df_germany_spain.head())

Statistics

We can also use Pandas to generate some basic statistics. For example, if we want to compute the mean and standard deviation of the credit score, we can do the following:

mean_credit_score = df['CreditScore'].mean()
print('Mean credit Score: ', mean_credit_score)
std_credit_score = df['CreditScore'].std()
print('Standard Deviation in Credit Score: ', std_credit_score)

We can also look at the minimum and maximum values:

min_credit_score = df['CreditScore'].min()
print('Min credit Score: ', min_credit_score)
max_credit_score = df['CreditScore'].max()
print('Max Credit Score: ', max_credit_score)

We can also compute the correlation between features and draw a heat map. Let's look at the correlation between "Age", "CreditScore", "EstimatedSalary" and "Tenure":

corr = df[['Age', 'CreditScore', 'EstimatedSalary', 'Tenure']].corr()
print(corr)

The correlation values can be plotted as a heat map with seaborn:

import seaborn as sns
sns.heatmap(corr)

Data aggregation

We can also use Pandas to aggregate data. Specifically, the groupby method can generate statistics at the category level. For example, we can build, from the original data, a new object that holds the mean "CreditScore" for each "Geography" value; this lets us analyse and compare the average credit scores of France, Spain and Germany:

df_groupby_mean = df.groupby('Geography')['CreditScore'].mean()
print(df_groupby_mean.head())

We can also look at the standard deviation of the credit score:

df_groupby_std = df.groupby('Geography')['CreditScore'].std()
print(df_groupby_std.head())

Let's look at another numeric column and compute the mean and standard deviation of age for each country:

df_groupby_age_mean = df.groupby('Geography')['Age'].mean()
print(df_groupby_age_mean.head())
df_groupby_age_std = df.groupby('Geography')['Age'].std()
print(df_groupby_age_std.head())

We can also group by multiple columns:

df_groupby_multiple_category = df.groupby(['Geography', 'Gender'])['Age'].mean()
print(df_groupby_multiple_category.head())

Conclusion

This article discussed how to perform tasks such as data selection, data aggregation and statistical analysis with the Pandas library. A short combined sketch follows.
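
To tie the pieces together, here is a short combined sketch, assuming the same df loaded from "Bank_churn_modelling.csv" above; it chains column selection, row filtering, and a grouped aggregation.

df_subset = df[['Geography', 'Gender', 'Age', 'CreditScore']]
df_over_40 = df_subset[df_subset['Age'] > 40]
summary = df_over_40.groupby(['Geography', 'Gender'])['CreditScore'].agg(['mean', 'std'])
print(summary.head())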

Original: https://blog.csdn.net/weixin_34738099/article/details/113538112
Author: 宫乘风
Title: pandas中的corr()_在Python中使用Pandas
