Automatic Classification of Genetic Variants

Source: kaggle.com/elemento/personalizedmedicine-rct

Thousands of genetic mutations can occur over the life cycle of a cancer cell. Some of these mutations are "bad" (they contribute to tumor growth), while others are "good". Accurately identifying these mutations is therefore of great importance when designing treatment plans.

Traditionally, genetic mutations are interpreted by hand. This is an extremely time-consuming task: a clinical pathologist must manually review and classify every single mutation based on evidence from text-based clinical literature.

A machine learning algorithm that uses the mutations already annotated by researchers as a training set and classifies genetic variants automatically would therefore save an enormous amount of manual effort.

This task predefines 9 classes, and every genetic mutation belongs to exactly one of them.

Both the training set and the test set are provided as two separate files. One file contains the variant information (in training_variants / test_variants); the other contains the clinical evidence that human experts used to classify the variants (text data, in training_text / test_text). The two are linked by the ID field.
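As a rough sketch of how the two tables fit together (the column names follow the loading code in Section 4.2; the concrete values below are invented purely for illustration):

import pandas as pd

# Hypothetical miniature versions of the two files (illustrative values only).
variants = pd.DataFrame({'ID': [0], 'Gene': ['FAM58A'],
                         'Variation': ['Truncating Mutations'], 'Class': [1]})
texts = pd.DataFrame({'ID': [0], 'TEXT': ['Cyclin-dependent kinases (CDKs) ...']})

# The two tables are linked by the shared ID field.
print(variants.merge(texts, on='ID', how='left'))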

3.1 Installing Dependencies

Even if the system's Python environment was installed via Anaconda, the following third-party dependencies still need to be installed:

!pip install mlxtend
!pip install imbalanced-learn
!pip install nltk
!pip install seaborn

If any of the commands above (usually the second one) fails with the following error:

ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: 'COPYING' Consider using the --user option or check the permissions.

then simply change the install command to !pip install --user imbalanced-learn.

3.2 Importing Dependencies

Run the following code to import all dependencies:

import re
import nltk
import math
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from mlxtend.classifier import StackingClassifier
from tqdm import tqdm
from collections import defaultdict
from scipy.sparse import hstack
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

warnings.filterwarnings('ignore')

nltk.download('stopwords')

4.1 Utility Functions

During data exploration, code with shared logic is factored out into utility functions to improve efficiency.

def plot_confusion_matrix(y_true, y_predict):
    """
    Given the true and predicted labels, plot the confusion matrix
    together with the precision and recall matrices.
    """
    C = confusion_matrix(y_true, y_predict)
    # Row-normalized counts: recall of each true class.
    A = C / C.sum(axis=1, keepdims=True)
    # Column-normalized counts: precision of each predicted class.
    B = C / C.sum(axis=0, keepdims=True)

    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    print('-'*20, 'Confusion matrix', '-'*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(C, annot=True, cmap='YlGnBu', fmt='.3f', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted class")
    plt.ylabel('True class')
    plt.show()

    print('-'*20, 'Precision matrix', '-'*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(B, annot=True, cmap='YlGnBu', fmt='.3f', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted class")
    plt.ylabel('True class')
    plt.show()

    print('-'*20, 'Recall matrix', '-'*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(A, annot=True, cmap='YlGnBu', fmt='.3f', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted class")
    plt.ylabel('True class')
    plt.show()

def get_feature_ratio_dict(alpha, feature_name, df):
    """
    Used for Response encoding.

    For the given feature, compute the ratio of each feature value within
    every variant class. The feature can be the gene name ('Gene') or the
    variation name ('Variation').

    alpha: Laplace smoothing coefficient
    feature_name: name of the feature column
    df: data frame holding the raw data
    """
    value_count = df[feature_name].value_counts()
    fea_dict = dict()

    for i, denominator in value_count.items():
        vec = []
        for k in range(1, 10):
            sub_df = df.loc[(df['Class']==k) & (df[feature_name]==i)]
            vec.append((sub_df.shape[0]+alpha*10) / (denominator+90*alpha))
        fea_dict[i] = vec

    return fea_dict

def get_response_code(alpha, feature_name, df):
    """
    Used for Response encoding.

    Apply Response encoding to the given column of every instance in df.

    alpha: Laplace smoothing coefficient
    feature_name: name of the feature column
    df: data frame holding the raw data
    """
    fea_dict = get_feature_ratio_dict(alpha, feature_name, df)
    fea = []

    for _, row in df.iterrows():
        if row[feature_name] in fea_dict:
            fea.append(fea_dict[row[feature_name]])
        else:
            # Feature values never seen during counting fall back to a uniform distribution.
            fea.append([1/9] * 9)

    return fea

def get_word_freq(df):
    """对于输入的df,统计其"TEXT"字段下所有内容的各词词频"""
    freq_dict = defaultdict(int)

    for _, row in df.iterrows():
        for word in row['TEXT'].split():
            freq_dict[word] += 1
    return freq_dict

def get_word_freq_list(df):
    """统计每个变异类别所对应的所有出现过的描述文本的各词词频"""
    freq_dict_list = []

    for i in range(1, 10):
        sub_df = df[df['Class'] == i]
        freq_dict_list.append(get_word_freq(sub_df))
    return freq_dict_list

def get_text_response(df):
    """对于描述文本,提取其Response编码"""
    text_feature_responsecoding = np.zeros((df.shape[0], 9))
    freq_dict_list = get_word_freq_list(df)
    all_freq_dict = get_word_freq(df)

    for i in range(0, 9):
        row_index = 0
        for _, row in df.iterrows():
            sum_prob = 0
            for word in row['TEXT'].split():
                sum_prob += math.log((freq_dict_list[i].get(word, 0)+10) / (all_freq_dict.get(word, 0)+90))
            text_feature_responsecoding[row_index][i] = math.exp(sum_prob / len(row['TEXT'].split()))
            row_index += 1
    return text_feature_responsecoding

4.2 Loading the Raw Data


cls_data = pd.read_csv('./my_data/training_variants.zip')

text_data = pd.read_csv('./my_data/training_text.zip', sep='\|\|', engine='python', names=['ID', 'TEXT'], skiprows=1, encoding='utf-8')

4.3 Stopword Removal and Dataset Merging

Filtering out stopwords helps increase feature variance and thereby improves model performance.

Here we directly use the stopword list that ships with nltk:


stop_words = set(stopwords.words('english'))

def nlp_preprocessing(total_text, index, column):
    """Remove stopwords and other invalid characters."""
    if type(total_text) is not int:
        string = ''
        total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', total_text)
        total_text = re.sub(r'\s+', ' ', total_text)
        total_text = total_text.lower()

        for word in total_text.split():
            if word not in stop_words:
                string += word + ' '
        text_data.loc[index, column] = string

pbar = tqdm(total=len(text_data))
for index, row in text_data.iterrows():
    if type(row['TEXT']) is str:
        nlp_preprocessing(row['TEXT'], index, 'TEXT')
    else:
        print("The gene variant with ID", index, "has no text description")
    pbar.update(1)
pbar.close()

all_data = pd.merge(cls_data, text_data, on='ID', how='left')

all_data.loc[all_data['TEXT'].isnull(), 'TEXT'] = all_data['Gene'] + ' ' + all_data['Variation']

4.4 Feature Extraction Methods

Since all the raw features are text, they must be encoded first. Two encoding schemes are tried: OneHot encoding and Response encoding.

In text processing, OneHot encoding provides a way to convert text into numbers. In brief, the procedure is:

1. Scan the corpus and build a vocabulary of all distinct words.
2. Represent each sample as a vector with one component per vocabulary word, recording whether (and how often) that word occurs in the sample.

After these steps, we obtain the OneHot-encoded feature vectors.
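A minimal sketch of this idea, using the same CountVectorizer that the later sections rely on (the two-document corpus here is purely illustrative):

from sklearn.feature_extraction.text import CountVectorizer

corpus = ['brca1 truncating mutation', 'brca1 amplification']  # toy corpus
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Vocabulary learned from the corpus (use get_feature_names() on older scikit-learn).
print(vectorizer.get_feature_names_out())
# One count vector per document, one column per vocabulary word.
print(X.toarray())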

Response encoding is another way of handling text data: by counting word frequencies per class, each word is mapped to a feature vector that replaces the original word, completing the conversion from text to numbers. Its procedure, in outline:

Given a column of text data $[char_1, char_2, \ldots, char_n]$ and the corresponding class labels [1, 2, 1, ..., 9] (assuming 9 classes in total), where $char_i$ denotes a single word, Response encoding proceeds as follows:

1. For each distinct value $char$, count its total number of occurrences and its number of occurrences within each class $k$.
2. Encode $char$ as a 9-dimensional vector whose $k$-th component is the Laplace-smoothed fraction of its occurrences that fall into class $k$ (this is what get_feature_ratio_dict computes).
3. Values never seen during counting are encoded as the uniform vector $[1/9, \ldots, 1/9]$ (see get_response_code).
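Concretely, with $\alpha$ the Laplace smoothing coefficient, get_feature_ratio_dict computes for each feature value $char$ and each class $k$:

$$\mathrm{vec}(char)_k = \frac{N_k(char) + 10\alpha}{N(char) + 90\alpha}, \qquad k = 1, \dots, 9$$

where $N_k(char)$ is the number of occurrences of $char$ within class $k$ and $N(char)$ its total number of occurrences. The nine components sum to 1, so each feature value is effectively replaced by a smoothed class distribution.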

If $char_i$ is a long sentence rather than a single word, it must first be tokenized; each word is then encoded as above and the per-word scores are combined into a single 9-dimensional vector for the whole text. This is what get_text_response does.
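In the code above, get_text_response combines the per-word scores as a geometric mean of smoothed class-conditional word frequencies:

$$\mathrm{score}_k(T) = \exp\Bigl(\frac{1}{|T|} \sum_{w \in T} \log \frac{n_k(w) + 10}{n(w) + 90}\Bigr), \qquad k = 1, \dots, 9$$

where $T$ is the text, $|T|$ its number of words, $n_k(w)$ the frequency of word $w$ in the texts of class $k$, and $n(w)$ its frequency over all texts. The resulting nine scores per text are then row-normalized (see Section 4.5).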

4.5 Data Exploration


y_true = all_data['Class'].values
train_val_x, X_test, train_val_y, y_test = train_test_split(all_data, y_true, stratify=y_true, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(train_val_x, train_val_y, stratify=train_val_y, test_size=0.2)
print('Number of data points in train data            :', X_train.shape[0])
print('Number of data points in test data             :', X_test.shape[0])
print('Number of data points in cross-validation data :', X_val.shape[0])

train_class_distr = X_train['Class'].value_counts().sort_index()
train_class_distr.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Data points per class')
plt.title('Distribution of y in train data')
plt.grid()
plt.show()

val_class_distr = X_val['Class'].value_counts().sort_index()
val_class_distr.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Data points per class')
plt.title('Distribution of y in validation data')
plt.grid()
plt.show()

test_class_distr = X_test['Class'].value_counts().sort_index()
test_class_distr.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Data points per class')
plt.title('Distribution of y in test data')
plt.grid()
plt.show()

unique_genes = X_train['Gene'].value_counts()
print(f"Number of unique genes in train data is: {len(unique_genes)}. The distribution of genes in train data is as follows:")
s = sum(unique_genes.values)
h = unique_genes.values / s
plt.plot(h, label="Histogram of Genes")
plt.xlabel('Index of a Gene')
plt.ylabel('Frequency of Occurrences')
plt.legend()
plt.grid()
plt.show()

train_gene_response = np.array(get_response_code(1, 'Gene', X_train))
test_gene_response = np.array(get_response_code(1, 'Gene', X_test))
val_gene_response = np.array(get_response_code(1, 'Gene', X_val))

print(f"The shape of response coded feature is {train_gene_response.shape[1]}")

gene_vectorizer = CountVectorizer()
train_gene_onehot = gene_vectorizer.fit_transform(X_train['Gene'])
test_gene_onehot = gene_vectorizer.transform(X_test['Gene'])
val_gene_onehot = gene_vectorizer.transform(X_val['Gene'])

print(f"The shape of onehot coded feature is {train_gene_onehot.shape[1]}")

Next, we build a machine learning model using only the Gene feature to evaluate its usefulness:


alpha = [10**x for x in range(-5, 1)]
loss_list = []

for i in alpha:
    clf = SGDClassifier(alpha=i, loss='log')

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_gene_onehot, y_train)
    y_pred = sig_clf.predict_proba(val_gene_onehot)
    loss_ = log_loss(y_val, y_pred)
    print(f"For alpha={i}, loss is {loss_}")
    loss_list.append(loss_)

fig, ax = plt.subplots()
ax.plot(alpha, loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], loss_list[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = SGDClassifier(alpha=best_alpha, loss='log')
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_gene_onehot, y_train)

y_pred = sig_clf.predict_proba(train_gene_onehot)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_gene_onehot)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_gene_onehot)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))

unique_variations = X_train['Variation'].value_counts()
print(f"Number of unique variations in train data is: {len(unique_variations)}. The distribution of variations in train data is as follows:")
s = sum(unique_variations.values)
h = unique_variations.values / s
plt.plot(h, label="Histogram of Variations")
plt.xlabel('Index of a Variation')
plt.ylabel('Frequency of Occurrences')
plt.legend()
plt.grid()
plt.show()

train_varin_response = np.array(get_response_code(1, 'Variation', X_train))
test_varin_response = np.array(get_response_code(1, 'Variation', X_test))
val_varin_response = np.array(get_response_code(1, 'Variation', X_val))

print(f"The shape of response coded feature is {train_varin_response.shape[1]}")

varin_vectorizer = CountVectorizer()
train_varin_onehot = varin_vectorizer.fit_transform(X_train['Variation'])
test_varin_onehot = varin_vectorizer.transform(X_test['Variation'])
val_varin_onehot = varin_vectorizer.transform(X_val['Variation'])

print(f"The shape of onehot coded feature is {train_varin_onehot.shape[1]}")

Next, we build a machine learning model using only the Variation feature to evaluate its usefulness:


alpha = [10**x for x in range(-5, 1)]
loss_list = []

for i in alpha:
    clf = SGDClassifier(alpha=i, loss='log')

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_varin_onehot, y_train)
    y_pred = sig_clf.predict_proba(val_varin_onehot)
    loss_ = log_loss(y_val, y_pred)
    print(f"For alpha={i}, loss is {loss_}")
    loss_list.append(loss_)

fig, ax = plt.subplots()
ax.plot(alpha, loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], loss_list[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = SGDClassifier(alpha=best_alpha, loss='log')
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_varin_onehot, y_train)

y_pred = sig_clf.predict_proba(train_varin_onehot)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_varin_onehot)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_varin_onehot)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))

train_text_response = get_text_response(X_train)
test_text_response = get_text_response(X_test)
val_text_response = get_text_response(X_val)

train_text_response = (train_text_response.T / train_text_response.sum(axis=1)).T
test_text_response = (test_text_response.T / test_text_response.sum(axis=1)).T
val_text_response = (val_text_response.T / val_text_response.sum(axis=1)).T

text_vectorizer = CountVectorizer(min_df=3)
train_text_onehot = text_vectorizer.fit_transform(X_train['TEXT'])
test_text_onehot = text_vectorizer.transform(X_test['TEXT'])
val_text_onehot = text_vectorizer.transform(X_val['TEXT'])

train_text_onehot = normalize(train_text_onehot, axis=0)
test_text_onehot = normalize(test_text_onehot, axis=0)
val_text_onehot = normalize(val_text_onehot, axis=0)

alpha = [10**x for x in range(-5, 1)]
loss_list = []

for i in alpha:
    clf = SGDClassifier(alpha=i, loss='log')

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_text_onehot, y_train)
    y_pred = sig_clf.predict_proba(val_text_onehot)
    loss_ = log_loss(y_val, y_pred)
    print(f"For alpha={i}, loss is {loss_}")
    loss_list.append(loss_)

fig, ax = plt.subplots()
ax.plot(alpha, loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], loss_list[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = SGDClassifier(alpha=best_alpha, loss='log')
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_text_onehot, y_train)

y_pred = sig_clf.predict_proba(train_text_onehot)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_text_onehot)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_text_onehot)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))

5.1 Preliminaries

train_onehot = hstack((hstack((train_gene_onehot, train_varin_onehot)), train_text_onehot)).tocsr()
test_onehot = hstack((hstack((test_gene_onehot, test_varin_onehot)), test_text_onehot)).tocsr()
val_onehot = hstack((hstack((val_gene_onehot, val_varin_onehot)), val_text_onehot)).tocsr()

print("OneHot encoding feature:")
print(f"Train data: {train_onehot.shape}")
print(f"Test data: {test_onehot.shape}")
print(f"Val data: {val_onehot.shape}")
train_response = np.hstack((np.hstack((train_gene_response, train_varin_response)), train_text_response))
test_response = np.hstack((np.hstack((test_gene_response, test_varin_response)), test_text_response))
val_response = np.hstack((np.hstack((val_gene_response, val_varin_response)), val_text_response))

print("Response encoding feature:")
print(f"Train data: {train_response.shape}")
print(f"Test data: {test_response.shape}")
print(f"Val data: {val_response.shape}")

5.2 Naive Bayes Model

alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
loss_list = []

for i in alpha:
    clf = MultinomialNB(alpha=i)

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_onehot, y_train)
    y_pred = sig_clf.predict_proba(val_onehot)
    loss_ = log_loss(y_val, y_pred)
    loss_list.append(loss_)
    print(f"For alpha={i}, log Loss:", loss_)
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]), loss_list[i]))

plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = MultinomialNB(alpha=best_alpha)
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_onehot, y_train)

y_pred = sig_clf.predict_proba(train_onehot)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_onehot)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_onehot)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))

plot_confusion_matrix(y_test, sig_clf.predict(test_onehot))

5.3 KNN Model

alpha = [5, 11, 15, 21, 31, 41, 51, 99]
loss_list = []

for i in alpha:
    clf = KNeighborsClassifier(n_neighbors=i)

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_response, y_train)
    y_pred = sig_clf.predict_proba(val_response)
    loss_ = log_loss(y_val, y_pred)
    loss_list.append(loss_)
    print(f"For alpha={i}, log loss is: {loss_}")
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]), loss_list[i]))

plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = KNeighborsClassifier(n_neighbors=best_alpha)
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_response, y_train)

y_pred = sig_clf.predict_proba(train_response)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_response)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_response)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))

plot_confusion_matrix(y_test, sig_clf.predict(test_response))

5.4 Logistic Regression (LR) Model

alpha = [10**x for x in range(-6, 3)]
loss_list = []

for i in alpha:
    clf = SGDClassifier(class_weight='balanced', alpha=i, loss='log')

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_onehot, y_train)
    y_pred = sig_clf.predict_proba(val_onehot)
    loss_ = log_loss(y_val, y_pred)
    loss_list.append(loss_)
    print(f"For alpha={i}, log loss is: {loss_}")

fig, ax = plt.subplots()
ax.plot(np.log10(alpha), loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]), loss_list[i]))

plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = SGDClassifier(class_weight='balanced', alpha=best_alpha, loss='log')
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_onehot, y_train)

y_pred = sig_clf.predict_proba(train_onehot)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_onehot)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_onehot)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))
plot_confusion_matrix(y_test, sig_clf.predict(test_onehot))

# Repeat the experiment without class balancing for comparison.
alpha = [10**x for x in range(-6, 3)]
loss_list = []

for i in alpha:
    clf = SGDClassifier(alpha=i, loss='log')

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_onehot, y_train)
    y_pred = sig_clf.predict_proba(val_onehot)
    loss_ = log_loss(y_val, y_pred)
    loss_list.append(loss_)
    print(f"For alpha={i}, log loss is: {loss_}")
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]), loss_list[i]))

plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = SGDClassifier(alpha=best_alpha, loss='log')
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_onehot, y_train)

y_pred = sig_clf.predict_proba(train_onehot)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_onehot)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_onehot)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))

plot_confusion_matrix(y_test, sig_clf.predict(test_onehot))

5.5 SVM Model

alpha = [10**x for x in range(-5, 3)]
loss_list = []

for i in alpha:
    clf = SGDClassifier(class_weight='balanced', alpha=i, loss='hinge')

    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_onehot, y_train)
    y_pred = sig_clf.predict_proba(val_onehot)
    loss_ = log_loss(y_val, y_pred)
    loss_list.append(loss_)
    print(f"For alpha={i}, log loss is: {loss_}")
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), loss_list, c='g')
for i, txt in enumerate(np.round(loss_list, 3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]), loss_list[i]))

plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = alpha[np.argmin(loss_list)]
clf = SGDClassifier(class_weight='balanced', alpha=best_alpha, loss='hinge')
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_onehot, y_train)

y_pred = sig_clf.predict_proba(train_onehot)
print('For values of best alpha:', best_alpha, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_onehot)
print('For values of best alpha:', best_alpha, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_onehot)
print('For values of best alpha:', best_alpha, "The test log loss is:", log_loss(y_test, y_pred,))

plot_confusion_matrix(y_test, sig_clf.predict(test_onehot))

5.6 Random Forest (RF) Model

alpha = [50, 100, 200, 300, 500]
max_depth = [5, 10]
loss_list = []

for i in alpha:
    for j in max_depth:
        clf = RandomForestClassifier(n_estimators=i, max_depth=j, n_jobs=-1)

        sig_clf = CalibratedClassifierCV(clf)
        sig_clf.fit(train_onehot, y_train)
        y_pred = sig_clf.predict_proba(val_onehot)
        loss_ = log_loss(y_val, y_pred)
        loss_list.append(loss_)
        print(f"For estimators={i}, depth={j}, log loss is: {loss_}")

best_index = np.argmin(loss_list)
best_alpha = alpha[best_index // 2]      # 2 = len(max_depth): depth varies fastest in the grid
best_depth = max_depth[best_index % 2]
clf = RandomForestClassifier(n_estimators=best_alpha, max_depth=best_depth, n_jobs=-1)
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_onehot, y_train)

y_pred = sig_clf.predict_proba(train_onehot)
print('For values of best alpha:', best_alpha, ', best max-depth:', best_depth, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_onehot)
print('For values of best alpha:', best_alpha, ', best max-depth:', best_depth, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_onehot)
print('For values of best alpha:', best_alpha, ', best max-depth:', best_depth, "The test log loss is:", log_loss(y_test, y_pred,))

plot_confusion_matrix(y_test, sig_clf.predict(test_onehot))

# Repeat the grid search with a wider range of tree depths.
alpha = [50, 100, 200, 300, 500]
max_depth = [2, 3, 5, 10]
loss_list = []

for i in alpha:
    for j in max_depth:
        clf = RandomForestClassifier(n_estimators=i, max_depth=j, n_jobs=-1)

        sig_clf = CalibratedClassifierCV(clf)
        sig_clf.fit(train_onehot, y_train)
        y_pred = sig_clf.predict_proba(val_onehot)
        loss_ = log_loss(y_val, y_pred)
        loss_list.append(loss_)
        print(f"For estimators={i}, depth={j}, log loss is: {loss_}")
best_index = np.argmin(loss_list)
best_alpha = alpha[int(best_index/4)]
best_depth = max_depth[int(best_index%4)]
clf = RandomForestClassifier(n_estimators=best_alpha, max_depth=best_depth, n_jobs=-1)
sig_clf = CalibratedClassifierCV(clf)
sig_clf.fit(train_onehot, y_train)

y_pred = sig_clf.predict_proba(train_onehot)
print('For values of best alpha:', best_alpha, ', best max-depth:', best_depth, "The train log loss is:", log_loss(y_train, y_pred))

y_pred = sig_clf.predict_proba(val_onehot)
print('For values of best alpha:', best_alpha, ', best max-depth:', best_depth, "The cross val log loss is:", log_loss(y_val, y_pred))

y_pred = sig_clf.predict_proba(test_onehot)
print('For values of best alpha:', best_alpha, ', best max-depth:', best_depth, "The test log loss is:", log_loss(y_test, y_pred,))

plot_confusion_matrix(y_test, sig_clf.predict(test_onehot))

5.7 Model Ensembling


clf1 = MultinomialNB(alpha=0.1)
sig_clf1 = CalibratedClassifierCV(clf1)

clf2 = KNeighborsClassifier(n_neighbors=5)
sig_clf2 = CalibratedClassifierCV(clf2)

clf3 = SGDClassifier(alpha=0.001, loss='log', class_weight='balanced')
sig_clf3 = CalibratedClassifierCV(clf3)

clf4 = SGDClassifier(class_weight='balanced', alpha=0.001, loss='hinge')
sig_clf4 = CalibratedClassifierCV(clf4)

clf5 = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=-1)
sig_clf5 = CalibratedClassifierCV(clf5)

alpha = [0.0001, 0.001, 0.01, 0.1, 1]
best_loss = 999
best_alpha = 999

for i in alpha:
    lr = LogisticRegression(C=i)
    sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3, sig_clf4, sig_clf5],
                              meta_classifier=lr, use_probas=True)
    sclf.fit(train_onehot, y_train)
    loss_ = log_loss(y_val, sclf.predict_proba(val_onehot))
    print(f"Stacking classifiers for alpha={i}, log loss is: {loss_}")
    if best_loss > loss_:
        best_loss = loss_
        best_alpha = i

lr = LogisticRegression(C=best_alpha)
sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3, sig_clf4, sig_clf5],
                          meta_classifier=lr, use_probas=True)
sclf.fit(train_onehot, y_train)

log_error = log_loss(y_train, sclf.predict_proba(train_onehot))
print("Log loss (train) on the stacking classifier:", log_error)

log_error = log_loss(y_val, sclf.predict_proba(val_onehot))
print("Log loss (CV) on the stacking classifier:", log_error)

log_error = log_loss(y_test, sclf.predict_proba(test_onehot))
print("Log loss (test) on the stacking classifier:", log_error)

plot_confusion_matrix(y_test, sclf.predict(test_onehot))

Original: https://blog.csdn.net/SunJW_2017/article/details/123252964
Author: 芳樽里的歌
Title: Automatic Classification of Genetic Variants
