# 参考链接

https://blog.csdn.net/weixin_46344368/article/details/107214449?spm=1001.2014.3001.5502

# code

# Load data
import pandas as pd
import numpy as np
# NOTE(review): `data` is never defined in the visible source. It is presumably
# a pandas DataFrame with columns 'x1', 'x2' and 'y' loaded in a missing step
# (e.g. via pd.read_csv) — confirm and add the load before running this cell.
# Define X (every column except 'y') and y (the 'y' column)
X = data.drop(['y'],axis=1)
y = data.loc[:,'y']
# Visualize the data
# The next line is an IPython/Jupyter magic; it is not valid in a plain .py script.
%matplotlib inline
from matplotlib import pyplot as plt
fig1 = plt.figure(figsize=(10,10))
# Only the rows with y == 1 are drawn
good = plt.scatter(X.loc[:,'x1'][y==1],X.loc[:,'x2'][y==1])
plt.title('raw data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# Anomaly detection
# NOTE(review): EllipticEnvelope is imported but never used in the visible
# code; the actual detection step appears to be missing from this section.
from sklearn.covariance import EllipticEnvelope

# Visualize the points: re-draw the y == 1 samples on a smaller figure
fig2 = plt.figure(figsize=(5, 5))
good = plt.scatter(X.loc[y == 1, 'x1'], X.loc[y == 1, 'x2'])
plt.title('raw data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# Rebuild the label vector y and the feature matrix X from `data`
y = data['y']
X = data.drop(columns=['y'])

# PCA: standardize the features, project onto two principal components,
# then print and plot the share of variance each component explains
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
X_norm = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_norm)
var_ratio = pca.explained_variance_ratio_
print(var_ratio)
fig4 = plt.figure(figsize=(5, 5))
plt.bar(x=[1, 2], height=var_ratio)
plt.show()

# Data split: hold out 40% of the samples for testing (fixed seed for repeatability)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=4,
)
print(X_train.shape, X_test.shape, X.shape)

# KNN model: fit a 10-neighbour classifier on the training split,
# then predict labels for both the training and the test split
from sklearn.neighbors import KNeighborsClassifier
knn_10 = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
y_train_predict = knn_10.predict(X_train)
y_test_predict = knn_10.predict(X_test)

# Compute the accuracy on the training and the test split
from sklearn.metrics import accuracy_score
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_predict)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_predict)
print("training accuracy:", accuracy_train)
print("testing accuracy:", accuracy_test)

# Generate every (x1, x2) combination on a grid over [0, 10) with step 0.05
xx, yy = np.meshgrid(
    np.arange(0, 10, 0.05),
    np.arange(0, 10, 0.05),
)
print(yy.shape)

# Flatten the grid into a two-column array: one row per grid point
x_range = np.column_stack((xx.ravel(), yy.ravel()))
print(x_range.shape)

# Classify every grid point with the fitted 10-neighbour model
y_range_predict = knn_10.predict(x_range)

fig4 = plt.figure(figsize=(10, 10))
# Grid points predicted as class 1 form the background region
knn_good = plt.scatter(
    x_range[y_range_predict == 1, 0],
    x_range[y_range_predict == 1, 1],
)

# Overlay the actual y == 1 samples
good = plt.scatter(X.loc[y == 1, 'x1'], X.loc[y == 1, 'x2'])
plt.title('prediction result')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# Confusion matrix of the test-set predictions and the metrics derived from it
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_test_predict)
print(cm)

# Index 1 is treated as the positive class, index 0 as the negative class;
# the first index is the actual label, the second the predicted label.
TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print(TP, TN, FP, FN)

# Share of all samples that were classified correctly
accuracy = (TP + TN) / (TP + TN + FP + FN)
print(accuracy)

# Share of actual positives that were predicted positive
recall = TP / (TP + FN)
print(recall)

# Share of actual negatives that were predicted negative
specificity = TN / (TN + FP)
print(specificity)

# Share of predicted positives that are actually positive
precision = TP / (TP + FP)
print(precision)

# Harmonic mean of precision and recall
F1 = 2 * precision * recall / (precision + recall)
print(F1)

# Try every n_neighbors value from 1 to 20 and record the accuracy on the
# training and the test split for each one (the curves are plotted afterwards).
n = list(range(1, 21))
accuracy_train = []
accuracy_test = []
for i in n:
    # The loop body must be indented; without indentation the script does
    # not even parse (IndentationError).
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_train_predict = knn.predict(X_train)
    y_test_predict = knn.predict(X_test)
    accuracy_train_i = accuracy_score(y_train, y_train_predict)
    accuracy_test_i = accuracy_score(y_test, y_test_predict)
    accuracy_train.append(accuracy_train_i)
    accuracy_test.append(accuracy_test_i)
# Print the complete accuracy lists once, after the sweep has finished
print(accuracy_train, accuracy_test)

# Side-by-side curves: training accuracy (left) and testing accuracy (right)
# as a function of n_neighbors
fig5 = plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(n, accuracy_train, marker='o')
plt.title('training accuracy vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')

plt.subplot(1, 2, 2)
plt.plot(n, accuracy_test, marker='o')
plt.title('testing accuracy vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')

plt.show()


# image

Hope is a good thing, maybe the best of things, and no good thing ever dies. — Andy Dufresne

Original: https://www.cnblogs.com/eat-too-much/p/16191166.html
Author: HDU李少帅
Title: 实战（二） 机器学习之数据分离与混淆矩阵

(0)