【超详细】逻辑回归之kaggle糖尿病预测实战

逻辑回归——kaggle糖尿病预测实战
【实验所需数据私聊可发】

1、糖尿病是一组以高血糖为特征的代谢性疾病,高血糖由胰岛素分泌缺陷或其生物作用受损引起。长期存在的高血糖,会导致身体各种组织,特别是眼、肾、心脏、血管、神经的慢性损害和功能障碍。
2、通过2小时血浆葡萄糖浓度、2小时血清胰岛素、身体质量指数等特征来预测某个人是否罹患糖尿病,在众多的因素中,找到最能导致该病的关键特征。

1.1#数据导入

import warnings

import numpy as np

# Suppress warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Read the comma-separated file into a 2-D float array; skiprows=1 drops the
# first line of the file. The builtin ``float`` is used as dtype because the
# ``np.float`` alias was removed in NumPy 1.24.
data=np.loadtxt(r"D:\pima-indians-diabetes.data.csv",delimiter=",",skiprows=1,dtype=float)
data

运行结果:

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

1.2#分离特征变量和分类变量

# Every column except the last is a feature; the last column is the class label.
X, y = data[:, :-1], data[:, -1]

1.3#特征标准化

# Z-score standardisation: centre each column on its mean and scale it by
# its standard deviation (both computed column-wise, axis=0).
mu, std = X.mean(axis=0), X.std(axis=0)
X = (X - mu) / std

1.4#添加全1列


# Append a column of ones as the LAST column so that the final entry of
# theta acts as the intercept term.
x_ones = np.ones((X.shape[0], 1))
X = np.concatenate((X, x_ones), axis=1)

1.5#拆分数据

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=8,
)

1.6#将因变量转为列向量

# Turn the label vectors into (n, 1) column vectors so they line up with
# the (n, 1) output of sigmoid(X @ theta) during training and evaluation.
y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
print(y_train.shape, y_test.shape)

结果:

(537, 1) (231, 1)

1.7#初始化theta值

# One weight per column of X_train, stored as a column vector and
# initialised to 1.0.
theta = np.full((X_train.shape[1], 1), 1.0)
theta

结果:

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

1.8#设置步长值

# Step size (learning rate) used by the gradient-descent update.
alpha = 1e-3

1.9#定义sigmoid函数,并用梯度下降法训练模型

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)).

    ``z`` may be a scalar or a NumPy array; for an array the function is
    applied element-wise and the result has the same shape as ``z``.
    """
    return 1.0 / (1 + np.exp(-z))
# Batch gradient descent on the logistic-regression loss.
num_iters = 10000
# NOTE(review): m is hard-coded to 200 while the training split printed
# earlier has 537 rows, so the gradient is divided by 200 rather than by the
# number of training samples. Left unchanged to reproduce the published theta.
m = 200
for _ in range(num_iters):
    # Difference between predicted probabilities and the 0/1 labels.
    residual = sigmoid(np.dot(X_train, theta)) - y_train
    # Scaled gradient step; evaluation order matches alpha * grad / m.
    step = alpha * np.dot(X_train.T, residual) / m
    theta = theta - step
print(theta)

结果为:

[[ 0.39210287]
 [ 1.10657783]
 [-0.24092243]
 [ 0.0223229 ]
 [-0.17137676]
 [ 0.61819121]
 [ 0.45880179]
 [ 0.12971106]
 [-0.84498429]]

1.10#预测

# Predicted probability for every test row: sigmoid of the linear score.
pred_y = sigmoid(X_test.dot(theta))

1.11#预测结果二值化

# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
# The second comparison had lost its operator ("pred_y0.5"), which is a
# syntax error; "<=" is used so a probability of exactly 0.5 also maps to 0.
pred_y[pred_y>0.5]=1
pred_y[pred_y<=0.5]=0
print(pred_y.reshape(1,-1))

结果为:

[[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.

  0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.

  0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

  0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.

  0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.

  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.

  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.

  1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.

  0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.

  0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]]
# y_test is an (n, 1) column vector at this point; its transpose prints the
# true labels as a single row for comparison with the predictions.
print(y_test.T)

结果为:

[[0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.

  0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.

  0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1.

  0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.

  0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.

  1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1.

  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0.

  1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1.

  1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.

  0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1.]]

1.12#预测准确率:

# Accuracy = fraction of test rows whose predicted label equals the true label.
print("预测准确率为:",np.mean(pred_y==y_test))

结果为:

预测准确率为: 0.7878787878787878

【sklearn 实现逻辑回归】:
2.1#导入数据

# Read the comma-separated file into a 2-D float array; skiprows=1 drops the
# first line of the file. Fixes the misspelled ``deliniter`` keyword (which
# raises TypeError) and replaces the ``np.float`` alias removed in NumPy 1.24.
data = np.loadtxt(r"D:\pima-indians-diabetes.data.csv",delimiter = ",",skiprows = 1,dtype = float)

2.2#分离特征变量和分类变量

# Every column except the last is a feature; the last column is the class label.
X, y = data[:, :-1], data[:, -1]

2.3#特征标准化

# Z-score standardisation: centre each column on its mean and scale it by
# its standard deviation (both computed column-wise, axis=0).
mu, std = X.mean(axis=0), X.std(axis=0)
X = (X - mu) / std

2.4#拆分训练集和测试集

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing with a fixed random_state.
# The original call was missing its closing parenthesis (syntax error).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 8)

# Fit scikit-learn's logistic regression with its default settings and
# predict hard class labels for the held-out rows.
logist=LogisticRegression()
logist.fit(X_train,y_train)
y_predict=logist.predict(X_test)
print(y_predict)

结果为:

[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1.

 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.

 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.

 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.

 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.

 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.

 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0.

 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.

 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.]

计算模型准确率

# Accuracy = fraction of test rows whose predicted label equals the true label.
print("准确率:",np.mean(y_predict==y_test))

结果为:

准确率: 0.7792207792207793

Original: https://blog.csdn.net/weixin_50989751/article/details/123798177
Author: 笑裹群鏖
Title: 【超详细】逻辑回归之kaggle糖尿病预测实战

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/690614/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球