【机器学习】分类问题全流程

机器学习分类问题全流程

文章目录

冀以尘雾之微补益山海,荧烛末光增辉日月。 ——2022/1/27

导入类库

导入可视化模块、机器学习库中的模型评估模块和模型库模块;以方便后续使用。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pandas import read_csv
from pandas.plotting import scatter_matrix
from pandas import set_option
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

导入数据

将要用的数据集导入,本文以玻璃数据集glass_train.csv为例。


# Load the glass feature file and its separate label file, then join them
# column-wise into a single DataFrame `df` used by the rest of the script.
# NOTE(review): column 'AI' looks like a typo for 'Al' (aluminium) -- kept
# as-is so it stays consistent with the feature list used later on.
data_path = 'glass_train.csv'
dataset = read_csv(data_path, header=0)          # feature columns
tag = read_csv('glass_train_labels.csv', header=0)  # target column
df = pd.concat([dataset, tag], axis=1)
df.columns = ['RI', 'Na', 'Mg', 'AI', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'tag']
df  # notebook-style display of the assembled frame

数据探查与预处理

这里主要介绍简单的数据处理,对于复杂的数据集需要对数据做特征工程进行进一步的处理。


# --- Quick data audit (notebook-style: bare expressions display inline) ---
print(df.shape)            # (rows, columns)

df.describe()              # per-column summary statistics

df.dtypes                  # column data types

df.isna().sum()            # missing values per column

print(tag.value_counts())  # class balance of the target

df.corr(method='pearson')  # pairwise Pearson correlation matrix

# Correlation heatmap of all columns (including the target).
sns.heatmap(df.corr())
plt.show()

# Kernel-density estimate of the label distribution.
sns.kdeplot(df["tag"], color="#1874CD", label="tag_label", alpha=.7)
plt.grid()

plt.show()

# --- Hold-out split: 80% train / 20% validation, fixed seed -------------
array = df.values
X = array[:, 0:9].astype(float)  # nine physico-chemical features
Y = array[:, 9]                  # class label
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

算法审查-训练模型

确定评估算法的基准,利用传统的机器学习算法进行训练并比较训练结果。


# Cross-validation settings shared by every model-comparison loop below.
num_folds = 10
seed = 7
scoring = 'accuracy'

# Baseline spot-check: five classic classifiers on the raw (unscaled) features.
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['SVM'] = SVC()
results = []
for key in models:
    # Fixed seed => every model is scored on identical folds, a fair comparison.
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# Box plot of the per-fold accuracies for each algorithm.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(models.keys())
# BUG FIX: original called `pyplot.show()` but only `matplotlib.pyplot as plt`
# is imported, so the line raised NameError. Use the `plt` alias instead.
plt.show()


# Repeat the spot-check with standardized features: each model is wrapped in a
# Pipeline so the scaler is re-fit inside every CV fold (no data leakage).
base_estimators = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
]
pipelines = {}
for short_name, estimator in base_estimators:
    pipelines['Scaler' + short_name] = Pipeline(
        [('Scaler', StandardScaler()), (short_name, estimator)])

results = []
for name, pipe in pipelines.items():
    # Same fixed seed as above so fold assignments match across models.
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    fold_scores = cross_val_score(pipe, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(fold_scores)
    print('%s : %f (%f)' % (name, fold_scores.mean(), fold_scores.std()))

# Box plot of per-fold accuracies for the scaled pipelines.
fig = plt.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(pipelines.keys())
plt.show()


# Same scaled-pipeline comparison, this time for four ensemble classifiers.
ensemble_specs = [
    ('ScaledAB', 'AB', AdaBoostClassifier()),
    ('ScaledGBM', 'GBM', GradientBoostingClassifier()),
    ('ScaledRF', 'RFR', RandomForestClassifier()),
    ('ScaledET', 'ETR', ExtraTreesClassifier()),
]
ensembles = {}
for dict_key, step_name, estimator in ensemble_specs:
    ensembles[dict_key] = Pipeline(
        [('Scaler', StandardScaler()), (step_name, estimator)])

results = []
for name, pipe in ensembles.items():
    # Identical folds (fixed seed) across all ensembles.
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_result = cross_val_score(pipe, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (name, cv_result.mean(), cv_result.std()))

# Box plot of per-fold accuracies for the ensemble pipelines.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(ensembles.keys())
plt.show()

通过以上结果进行比较分析,得到结果最好的模型。

模型调参


# Tune the number of boosting stages for GradientBoostingClassifier via an
# exhaustive grid search on the standardized training data.
from sklearn.ensemble import GradientBoostingClassifier

# Fit the scaler on the training split only, then transform it.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

param_grid = {
    'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900],
}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('最优:%s 使用%s' % (grid_result.best_score_, grid_result.best_params_))

模型最终化&模型评估

根据上述的模型结果选择最佳的模型,以下只是一个参考。


# Final model: fit LightGBM on the standardized training split, then score
# it on the held-out validation split.
from lightgbm import LGBMClassifier

scaler = StandardScaler().fit(X_train)      # statistics from training data only
rescaledX = scaler.transform(X_train)
model = LGBMClassifier()
model.fit(X=rescaledX, y=Y_train)

# Transform the validation set with the *training* scaler and predict.
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(predictions)

# Side-by-side view of predicted vs. actual labels.
a = pd.DataFrame()
a['预测值'] = list(predictions)
a['实际值'] = list(Y_validation)
a.head()

# Feature-importance table, most important first.
# NOTE(review): 'AI' mirrors the (likely misspelled) column name set earlier.
features = ['RI', 'Na', 'Mg', 'AI', 'Si', 'K', 'Ca', 'Ba', 'Fe']
importances = model.feature_importances_

importances_df = pd.DataFrame()
importances_df['特征名称'] = features
importances_df['特征重要性'] = importances
importances_df.sort_values('特征重要性', ascending=False)

# Standard classification metrics on the validation split.
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

参考书籍:Python机器学习实战案例_赵卫东,董亮编著_2019.12
全书源代码:https://github.com/weizy1981/MachineLearning

本文链接:https://blog.csdn.net/qq_46426207/article/details/122723777

Original: https://blog.csdn.net/qq_46426207/article/details/122723777
Author: 穻易yuyee
Title: 【机器学习】分类问题全流程

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/664467/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球