纯numpy实现线性回归

摘要

本文记录了一个用 numpy 实现线性回归的例子。与直接调用 sklearn 的 LinearRegression 不同,这里用 numpy 手写梯度下降来优化模型参数。以下为代码部分:定义 RMSE、loss、R2 函数,定义回归模型,标准化,可视化等。

Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Define the RMSE, loss and R2 calculation functions


def rmse(y_test,pre):
    """Return the root-mean-square error between targets and predictions.

    The 2-norm of the residual ``y_test - pre`` is divided by the square
    root of ``len(pre)``.
    """
    residual = y_test - pre
    n_samples = len(pre)
    return np.linalg.norm(residual, ord=2) / n_samples ** 0.5
def R2(y_test,pre):
    """Return the coefficient of determination of ``pre`` against ``y_test``.

    Computed as one minus the ratio of the residual sum of squares to the
    sum of squared deviations of ``y_test`` from its own mean.
    """
    residual_ss = np.sum((y_test - pre) ** 2)
    total_ss = np.sum((y_test - np.mean(y_test)) ** 2)
    return 1 - residual_ss / total_ss
def Loss(y_test,pre):
    """Return the sum of squared differences between ``y_test`` and ``pre``."""
    squared_error = (y_test - pre) ** 2
    return np.sum(squared_error)
def np_split_data(data1,n_split):
    """Randomly split a DataFrame into training and test features/targets.

    Column 0 of ``data1`` is used as the target and all remaining columns
    as features. ``int(n_rows * n_split)`` rows are drawn without
    replacement through ``np.random.choice`` for training; every other row
    goes to the test set.

    Args:
        data1: pandas DataFrame with the target in its first column.
        n_split: fraction of the rows assigned to the training set.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)``. Each part is
        re-indexed from 0. Training rows keep the sampled (shuffled) order;
        test rows keep their original relative order.
    """
    n = data1.shape[0]
    training_idx = list(np.random.choice(range(n), int(n * n_split), replace=False))
    # The test rows are simply the complement of the training rows. Sorting
    # makes their order explicit instead of relying on set iteration order.
    testing_idx = sorted(set(range(n)) - set(training_idx))

    Y = data1.iloc[:, 0]
    X = data1.iloc[:, 1:]

    # reset_index(drop=True) renumbers each part from 0 and discards the
    # original row labels.
    x_train = X.iloc[training_idx, :].reset_index(drop=True)
    y_train = Y.iloc[training_idx].reset_index(drop=True)
    x_test = X.iloc[testing_idx, :].reset_index(drop=True)
    y_test = Y.iloc[testing_idx].reset_index(drop=True)

    return x_train, y_train, x_test, y_test

load data


# Load the house-price table from the CSV file.
path = r"/content/houseprices.csv"
data = pd.read_csv(path)
# Work on a copy: the label encoding later assigns into columns of `demo`,
# and with a plain alias those assignments would also overwrite the raw
# values held in `data`.
demo = data.copy()
# Bare expression: shows the frame when run as a notebook cell.
demo

     Home   Price  SqFt  Bedrooms  Bathrooms  Offers  Brick  Neighborhood
0       1  114300  1790         2          2       2     No          East
1       2  114200  2030         4          2       3     No          East
2       3  114800  1740         3          2       1     No          East
3       4   94700  1980         3          2       3     No          East
4       5  119800  2130         3          3       3     No          East
...   ...     ...   ...       ...        ...     ...    ...           ...
123   124  119700  1900         3          3       3    Yes          East
124   125  147900  2160         4          3       3    Yes          East
125   126  113500  2070         2          2       2     No         North
126   127  149900  2020         3          3       1     No          West
127   128  124600  2250         3          3       4     No         North

128 rows × 8 columns

The features differ greatly in numerical scale, so encode the categorical columns as integers and standardize all columns

from sklearn.preprocessing import StandardScaler

# Integer-encode the two categorical columns: each category is mapped to
# its position in that column's value_counts() index.
size_mapping_Neighborhood = {
    category: code
    for code, category in enumerate(data["Neighborhood"].value_counts().index)
}
size_mapping_Brick = {
    category: code
    for code, category in enumerate(data["Brick"].value_counts().index)
}
demo["Neighborhood"] = demo["Neighborhood"].map(size_mapping_Neighborhood)
demo["Brick"] = demo["Brick"].map(size_mapping_Brick)

# Drop the first column; everything that remains is scaled.
demo = demo.iloc[:, 1:]

# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/test split is drawn.
ss = StandardScaler()
demo = pd.DataFrame(ss.fit_transform(demo))
# Bare expression: shows the frame when run as a notebook cell.
demo

            0         1         2         3         4         5         6
0   -0.602585 -1.000916 -1.415327 -0.868939 -0.542769 -0.698836 -1.178538
1   -0.606321  0.137904  1.350503 -0.868939  0.396075 -0.698836 -1.178538
2   -0.583903 -1.238171 -0.032412 -0.868939 -1.481614 -0.698836 -1.178538
3   -1.334923 -0.099350 -0.032412 -0.868939  0.396075 -0.698836 -1.178538
4   -0.397082  0.612413 -0.032412  1.082362  0.396075 -0.698836 -1.178538
...       ...       ...       ...       ...       ...       ...       ...
123 -0.400818 -0.478957 -0.032412  1.082362  0.396075  1.430950 -1.178538
124  0.652851  0.754765  1.350503  1.082362  0.396075  1.430950 -1.178538
125 -0.632476  0.327707 -1.415327 -0.868939 -0.542769 -0.698836  0.057961
126  0.727580  0.090453 -0.032412  1.082362 -1.481614 -0.698836  1.294459
127 -0.217734  1.181823 -0.032412  1.082362  1.334919 -0.698836  0.057961

128 rows × 7 columns

# Draw a random split with a 0.8 training fraction, then report the shape
# of each of the four parts.
x_train, y_train, x_test, y_test = np_split_data(demo, 0.8)
for caption, part in (
    ("x_train shape is:", x_train),
    ("y_train shape is:", y_train),
    ("x_test shape is:", x_test),
    ("y_test shape is:", y_test),
):
    print(caption, part.shape)
x_train shape is: (102, 6)
y_train shape is: (102,)
x_test shape is: (26, 6)
y_test shape is: (26,)

Define the model


class LinearRegression:
    """Linear regression fitted by batch gradient descent on the MSE.

    The model is ``y_hat = x @ w.T + b`` where ``w`` has shape
    ``(1, n_features)`` and ``b`` has shape ``(1, 1)``. Both start at one.
    """

    def __init__(self, x_train, y_train, x_test, y_test,
                 learning_rate=0.001, loop_num=5000):
        """Store the data split and initialise the parameters.

        Args:
            x_train: 2-D training features (anything with ``.shape`` that
                ``np.asarray`` accepts, e.g. an ndarray or a DataFrame).
            y_train: training targets, one value per training row.
            x_test: test features, returned unchanged by ``get_test_data``.
            y_test: test targets, returned unchanged by ``get_test_data``.
            learning_rate: gradient-descent step size.
            loop_num: number of gradient-descent iterations run by ``train``.
        """
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        self.w = np.ones(shape=(1, self.x_train.shape[1]))
        self.b = np.array([[1.0]])
        self.learningRate = learning_rate
        self.Loopnum = loop_num
        self.loss = []  # training MSE recorded before each update

    def get_test_data(self):
        """Return the stored ``(x_test, y_test)`` pair."""
        return self.x_test, self.y_test

    def predict(self, x):
        """Return predictions for ``x`` as an ``(n_rows, 1)`` array."""
        return np.dot(np.asarray(x), self.w.T) + self.b

    def train(self):
        """Run ``Loopnum`` gradient-descent steps on the training data.

        Side effects: replaces ``self.y_train`` with an ``(n_rows, 1)``
        float array, appends one MSE value per iteration to ``self.loss``
        and updates ``self.w`` and ``self.b``.
        """
        features = np.asarray(self.x_train, dtype=float)
        n_samples = features.shape[0]
        # Targets must be a column vector like the predictions; a 1-D
        # target minus an (n, 1) prediction would broadcast to (n, n).
        self.y_train = np.asarray(self.y_train, dtype=float).reshape(n_samples, 1)
        targets = self.y_train

        for _ in range(self.Loopnum):
            residual = targets - (np.dot(features, self.w.T) + self.b)
            self.loss.append(float(np.mean(residual ** 2)))

            # d(MSE)/dw = -(2/n) * residual^T X
            w_gradient = -(2 / n_samples) * np.dot(residual.T, features)
            # d(MSE)/db = -(2/n) * sum(residual); the residual is taken
            # against the targets, not against the feature matrix.
            bias_gradient = -(2 / n_samples) * np.sum(residual)

            self.w = self.w - self.learningRate * w_gradient
            self.b = self.b - self.learningRate * bias_gradient

    def show_loss(self):
        """Print the recorded loss history."""
        print(self.loss)

    def draw(self):
        """Plot the loss history, leaving out the first five iterations."""
        tail = self.loss[5:]
        plt.figure()
        plt.plot(range(len(tail)), tail)
        plt.show()

# Build model Q1 from the current train/test split, fit it, and plot the
# recorded training-loss curve.
Q1 = LinearRegression(x_train,y_train,x_test,y_test)
Q1.train()
Q1.draw()

纯numpy实现线性回归
# Evaluate Q1 on the test split it was constructed with.
x_test,y_test = Q1.get_test_data()
from sklearn.metrics import r2_score
pre = Q1.predict(x_test)
# predict() returns a column vector; give y_test the same shape so the
# metrics compare element by element instead of broadcasting to (n, n).
y_test = np.array(y_test).reshape(pre.shape)
Y_loss = Loss(y_test, pre)
RMSE = rmse(y_test,pre)

print("Detailed data presentation of Q1 model")
print("The test machine data house price forecast is",pre.reshape(1,len(pre)))
print("Loss value:",Y_loss)
print("RMSE:",RMSE)
# R2 is scored on the same test split as the loss and RMSE above, so the
# three reported numbers describe the same rows.
print("R2 value: ", r2_score(y_test, pre))
print("Weight vector:",Q1.w)
Detailed data presentation of Q1 model
The test machine data house price forecast is [[-9.18914099e-01 -4.36959638e-02  3.61875813e-01  6.61200258e-01
  -3.70063242e-01  9.01867176e-01  1.66935978e-04 -6.96820499e-01
  -2.24830766e-01 -6.58279255e-01 -1.23682554e-01 -6.66188042e-01
  -5.18051463e-01  3.72984245e-01 -3.21099087e-01  2.52428836e-01
   1.45079221e+00 -2.06799752e+00  1.85956363e+00  5.81677824e-01
   8.88002999e-01  7.68281468e-01 -2.28030412e-01  3.12678491e+00
  -2.14092056e-01 -4.92613761e-02]]
Loss value: 6.586735591166908
RMSE: 0.5033249291219841
R2 value:  0.7824062539612411
Weight vector: [[ 0.52005525  0.27768886  0.15460977 -0.51425199  0.26932374]]

# Overlay the actual and predicted test targets. The x axis is the sample
# position, derived from the number of predictions rather than a fixed
# count so the plot works for any test-set size.
sample_positions = range(len(pre))
plt.plot(sample_positions, y_test, label='Y')
plt.plot(sample_positions, pre, label='pre')
plt.title("Compare Y with pre")
plt.legend()
plt.show()

纯numpy实现线性回归

select the best model

import matplotlib.pyplot as plt
# One scatter panel per feature: columns 1-6 of the standardized frame on
# the x axis against column 0 on the y axis, laid out on a 3x2 grid.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(14, 12))
for feature_col, panel in enumerate(ax.flat, start=1):
    panel.scatter(demo.iloc[:, feature_col], demo.iloc[:, 0])
    # `data` still carries the leading column that was dropped from
    # `demo`, so its column names are shifted by one.
    panel.set_xlabel(data.columns[feature_col + 1])
# Only the top-left panel gets a y-axis label.
ax[0][0].set_ylabel(data.columns[1])
plt.show()

纯numpy实现线性回归
# Model Q2: refit after dropping the column at position 2 of the
# standardized frame.
demo0 =demo.drop(demo.columns[2], axis=1)
# NOTE(review): this draws a new random split, so Q2 is scored on different
# test rows than Q1.
x_train,y_train,x_test,y_test = np_split_data(demo0,0.8)
Q2 = LinearRegression(x_train,y_train,x_test,y_test)
Q2.train()
x_test,y_test = Q2.get_test_data()
pre = Q2.predict(x_test)
# predict() returns a column vector; give y_test the same shape so the
# metrics compare element by element instead of broadcasting to (n, n).
y_test = np.array(y_test).reshape(pre.shape)
Y_loss = Loss(y_test, pre)
RMSE = rmse(y_test,pre)

print("Detailed data presentation of Q2 model")
print("The test machine data house price forecast is",pre.reshape(1,len(pre)))
print("Loss value:",Y_loss)
print("RMSE:",RMSE)
# R2 is scored on the same test split as the loss and RMSE above, so the
# three reported numbers describe the same rows.
print("R2 value: ", r2_score(y_test, pre))
print("Weight vector:",Q2.w)
Detailed data presentation of Q2 model
The test machine data house price forecast is [[-0.73152743 -1.27962581  0.0920423  -0.95137815  0.55296749  0.3941641
   0.03079132 -0.83098956  0.24954459 -0.62426603 -1.43822556  0.15402153
  -1.15424147 -0.50879432  1.65785734  1.43012849 -0.25162625  0.78827719
  -0.72864504  1.20641756  0.05650989 -0.52213994 -1.15030241 -0.34219166
   1.08861407  0.07005915]]
Loss value: 4.707307862404436
RMSE: 0.4255000615748141
R2 value:  0.8447121600566533
Weight vector: [[ 0.52402705  0.20488842 -0.40842359  0.32380326  0.35594341]]

· Ranked by RMSE (lower is better): Q2 (feature-subset model, 0.4255) < Q1 (full linear regression model, 0.5033), so Q2 is better.

· Ranked by R2 (higher is better): Q2 (feature-subset model, 0.8447) > Q1 (full linear regression model, 0.7824), so Q2 is better.

Original: https://blog.csdn.net/pylittlebrat/article/details/127180021
Author: ACxz
Title: 纯numpy实现线性回归

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/762181/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球