摘要
本文记录了一个基于 numpy 实现线性回归的例子。与直接调用 sklearn 的现成模型不同,这里用 numpy 手写梯度下降来优化模型参数。以下为代码部分:定义 RMSE、loss、R2 函数,划分训练/测试集,标准化,定义回归模型,可视化等。
Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Define the RMSE, loss and R2 calculation functions
def rmse(y_test, pre):
    """Return the root-mean-square error between targets and predictions.

    Both inputs are flattened before they are compared.  Without that, a 1-D
    target vector and an ``(n, 1)`` prediction column would broadcast into an
    ``(n, n)`` matrix and the result would no longer be an RMSE.

    Args:
        y_test: Array-like of true target values.
        pre: Array-like of predictions with the same number of elements.

    Returns:
        The RMSE as a NumPy float (``nan`` for empty input).

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(pre, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_test has {actual.size} elements but pre has {predicted.size}"
        )
    return np.sqrt(np.mean((actual - predicted) ** 2))
def R2(y_test, pre):
    """Return the coefficient of determination of predictions vs. targets.

    Computed as ``1 - SS_res / SS_tot``.  Both inputs are flattened first so
    that a 1-D target vector and an ``(n, 1)`` prediction column are compared
    element-wise instead of being broadcast into an ``(n, n)`` matrix.

    Args:
        y_test: Array-like of true target values.
        pre: Array-like of predictions with the same number of elements.

    Returns:
        The R2 score as a NumPy float.  When the targets are constant the
        total sum of squares is zero and the division yields ``nan``/``-inf``.

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(pre, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_test has {actual.size} elements but pre has {predicted.size}"
        )
    residual_ss = np.sum((actual - predicted) ** 2)
    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    return 1 - residual_ss / total_ss
def Loss(y_test, pre):
    """Return the sum of squared errors between targets and predictions.

    Both inputs are flattened first so that a 1-D target vector and an
    ``(n, 1)`` prediction column are compared element-wise instead of being
    broadcast into an ``(n, n)`` matrix.

    Args:
        y_test: Array-like of true target values.
        pre: Array-like of predictions with the same number of elements.

    Returns:
        The sum of squared residuals as a NumPy float.

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(pre, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_test has {actual.size} elements but pre has {predicted.size}"
        )
    return np.sum((actual - predicted) ** 2)
def np_split_data(data1, n_split, seed=None):
    """Randomly split a DataFrame into train/test features and targets.

    Column 0 of ``data1`` is taken as the target and every remaining column
    as a feature.  Rows are sampled without replacement for the training set;
    all other rows form the test set, kept in their original row order.

    Args:
        data1: pandas DataFrame whose first column is the target.
        n_split: Fraction of rows assigned to the training set;
            ``int(n_rows * n_split)`` rows are drawn.
        seed: Optional integer seed.  When given, the split is reproducible
            and independent of NumPy's global random state; when ``None``
            the global ``np.random`` state is used, as before.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` whose indexes are reset
        to ``0..len-1``.
    """
    n_rows = data1.shape[0]
    n_train = int(n_rows * n_split)

    rng = np.random if seed is None else np.random.RandomState(seed)
    training_idx = rng.choice(n_rows, n_train, replace=False)
    # Plain set difference with a sorted result, so the test rows do not
    # depend on set iteration order.
    testing_idx = np.setdiff1d(np.arange(n_rows), training_idx)

    target = data1.iloc[:, 0]
    features = data1.iloc[:, 1:]

    x_train = features.iloc[training_idx, :].reset_index(drop=True)
    y_train = target.iloc[training_idx].reset_index(drop=True)
    x_test = features.iloc[testing_idx, :].reset_index(drop=True)
    y_test = target.iloc[testing_idx].reset_index(drop=True)
    return x_train, y_train, x_test, y_test
load data
# Absolute location of the raw CSV file.
path = r"/content/houseprices.csv"
data = pd.read_csv(path)
# Work on an independent copy: assigning to columns of ``demo`` later on must
# not rewrite the raw ``data`` frame, which is still used afterwards for its
# original column names.
demo = data.copy()
# Bare expression: displays the frame when run as a notebook cell.
demo
     Home   Price  SqFt  Bedrooms  Bathrooms  Offers Brick Neighborhood
0       1  114300  1790         2          2       2    No         East
1       2  114200  2030         4          2       3    No         East
2       3  114800  1740         3          2       1    No         East
3       4   94700  1980         3          2       3    No         East
4       5  119800  2130         3          3       3    No         East
..    ...     ...   ...       ...        ...     ...   ...          ...
123   124  119700  1900         3          3       3   Yes         East
124   125  147900  2160         4          3       3   Yes         East
125   126  113500  2070         2          2       2    No        North
126   127  149900  2020         3          3       1    No         West
127   128  124600  2250         3          3       4    No        North
128 rows × 8 columns
The features differ widely in numerical scale, so the data is standardized (the categorical columns Brick and Neighborhood are first mapped to integer codes)
from sklearn.preprocessing import StandardScaler
# Integer-encode the two categorical columns.  Labels are numbered in the
# order returned by value_counts(), i.e. by descending frequency by default.
size_mapping_Neighborhood = {
    label: code
    for code, label in enumerate(data["Neighborhood"].value_counts().index)
}
size_mapping_Brick = {
    label: code
    for code, label in enumerate(data["Brick"].value_counts().index)
}
demo["Brick"] = demo["Brick"].map(size_mapping_Brick)
demo["Neighborhood"] = demo["Neighborhood"].map(size_mapping_Neighborhood)

# Discard the first column, then standardize every remaining column
# (target included).  Wrapping the scaled array in a new DataFrame leaves the
# columns labelled 0..k-1 instead of by name.
demo = demo.iloc[:, 1:]
ss = StandardScaler()
demo = pd.DataFrame(ss.fit_transform(demo))
# Bare expression: displays the frame when run as a notebook cell.
demo
            0         1         2         3         4         5         6
0   -0.602585 -1.000916 -1.415327 -0.868939 -0.542769 -0.698836 -1.178538
1   -0.606321  0.137904  1.350503 -0.868939  0.396075 -0.698836 -1.178538
2   -0.583903 -1.238171 -0.032412 -0.868939 -1.481614 -0.698836 -1.178538
3   -1.334923 -0.099350 -0.032412 -0.868939  0.396075 -0.698836 -1.178538
4   -0.397082  0.612413 -0.032412  1.082362  0.396075 -0.698836 -1.178538
..        ...       ...       ...       ...       ...       ...       ...
123 -0.400818 -0.478957 -0.032412  1.082362  0.396075  1.430950 -1.178538
124  0.652851  0.754765  1.350503  1.082362  0.396075  1.430950 -1.178538
125 -0.632476  0.327707 -1.415327 -0.868939 -0.542769 -0.698836  0.057961
126  0.727580  0.090453 -0.032412  1.082362 -1.481614 -0.698836  1.294459
127 -0.217734  1.181823 -0.032412  1.082362  1.334919 -0.698836  0.057961
128 rows × 7 columns
# Random split with 80% of the rows used for training; column 0 of ``demo``
# is the target and the remaining columns are the features.
x_train, y_train, x_test, y_test = np_split_data(demo, 0.8)
for split_name, split_part in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape is:", split_part.shape)
x_train shape is: (102, 6)
y_train shape is: (102,)
x_test shape is: (26, 6)
y_test shape is: (26,)
Define the model
class LinearRegression:
    """Linear regression fitted with full-batch gradient descent on the MSE.

    The model is ``y = x @ w.T + b`` with ``w`` of shape ``(1, n_features)``
    and ``b`` of shape ``(1, 1)``; both start at one.
    """

    def __init__(self, x_train, y_train, x_test, y_test,
                 learning_rate=0.001, n_iter=5000):
        """Store the data splits and initialise parameters.

        Args:
            x_train: Training features, shape ``(n_samples, n_features)``.
            y_train: Training targets with ``n_samples`` elements.
            x_test: Test features.
            y_test: Test targets.
            learning_rate: Gradient-descent step size.
            n_iter: Number of gradient-descent iterations run by ``train``.
        """
        # Kept for backward compatibility: this attribute used to be bound to
        # the module-level ``data`` frame.  Look it up defensively so the
        # class can also be constructed where no such global exists.
        self.demo = globals().get("data")
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        self.w = np.ones(shape=(1, self.x_train.shape[1]))
        self.b = np.array([[1.0]])
        self.learningRate = learning_rate
        self.Loopnum = n_iter
        # One mean-squared-error value (float) per training iteration.
        self.loss = []

    def get_test_data(self):
        """Return the stored ``(x_test, y_test)`` pair."""
        return self.x_test, self.y_test

    def predict(self, x):
        """Return predictions for ``x`` as an ``(n_samples, 1)`` array."""
        predictions = np.dot(x, self.w.T) + self.b
        return predictions

    def train(self):
        """Run ``Loopnum`` gradient-descent steps on the training split.

        Side effects: ``y_train`` is replaced by an ``(n_samples, 1)`` float
        array, ``w`` and ``b`` are updated, and the MSE measured before each
        step is appended to ``loss``.
        """
        features = np.asarray(self.x_train, dtype=float)
        n_samples = features.shape[0]
        # Reshape once, outside the loop, so the targets line up with the
        # (n_samples, 1) prediction column.
        self.y_train = np.array(self.y_train, dtype=float).reshape(n_samples, 1)
        targets = self.y_train

        for _ in range(self.Loopnum):
            residual = targets - (np.dot(features, self.w.T) + self.b)
            self.loss.append(float(np.mean(residual ** 2)))

            w_gradient = -(2 / n_samples) * np.dot(residual.T, features)
            # d(MSE)/db depends on the residual y - y_hat; the previous code
            # used x - y_hat here, which is not the gradient of the loss.
            bias_gradient = -(2 / n_samples) * np.sum(residual)

            self.w = self.w - self.learningRate * w_gradient
            self.b = self.b - self.learningRate * bias_gradient

    def show_loss(self):
        """Print the recorded per-iteration loss values."""
        print(self.loss)

    def draw(self):
        """Plot the loss curve, leaving out the first five iterations."""
        tail = self.loss[5:]
        plt.figure()
        plt.plot(range(len(tail)), tail)
        plt.show()
from sklearn.metrics import r2_score

# Fit the model on every feature and show its training-loss curve.
Q1 = LinearRegression(x_train, y_train, x_test, y_test)
Q1.train()
Q1.draw()

# Score the held-out split.  Predictions come back as a column, so the
# targets are reshaped to match before any element-wise arithmetic.
x_test, y_test = Q1.get_test_data()
pre = Q1.predict(x_test)
y_test = np.array(y_test).reshape(pre.shape)
test_residual = y_test - pre
Y_loss = np.sum(np.square(test_residual))
RMSE = rmse(y_test, pre)

# NOTE(review): Loss and RMSE above use the test split, but this R2 is
# computed on the training split - confirm that is intended.
train_r2 = r2_score(Q1.y_train, Q1.predict(Q1.x_train))

print("Detailed data presentation of Q1 model")
print("The test machine data house price forecast is", pre.reshape(1, -1))
print("Loss value:", Y_loss)
print("RMSE:", RMSE)
print("R2 value: ", train_r2)
print("Weight vector:", Q1.w)
Detailed data presentation of Q1 model
The test machine data house price forecast is [[-9.18914099e-01 -4.36959638e-02 3.61875813e-01 6.61200258e-01
-3.70063242e-01 9.01867176e-01 1.66935978e-04 -6.96820499e-01
-2.24830766e-01 -6.58279255e-01 -1.23682554e-01 -6.66188042e-01
-5.18051463e-01 3.72984245e-01 -3.21099087e-01 2.52428836e-01
1.45079221e+00 -2.06799752e+00 1.85956363e+00 5.81677824e-01
8.88002999e-01 7.68281468e-01 -2.28030412e-01 3.12678491e+00
-2.14092056e-01 -4.92613761e-02]]
Loss value: 6.586735591166908
RMSE: 0.5033249291219841
R2 value: 0.7824062539612411
Weight vector: [[ 0.52005525 0.27768886 0.15460977 -0.51425199 0.26932374]]
# Overlay true and predicted test targets, one x position per test sample.
# The axis length follows the data instead of a hard-coded 26, so the plot
# keeps working when the split ratio or the dataset size changes.
sample_axis = range(len(y_test))
plt.plot(sample_axis, y_test, label='Y')
plt.plot(sample_axis, pre, label='pre')
plt.title("Compare Y with pre")
plt.legend()
plt.show()
select the best model
import matplotlib.pyplot as plt
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(14, 12))
# Panels are filled row by row: panel k shows standardized column k of
# ``demo`` on the x axis against column 0 on the y axis.  Axis labels come
# from the raw ``data`` frame, which still has the leading column that was
# dropped from ``demo``, hence the +1 offset.
for feature_col, panel in enumerate(ax.flat, start=1):
    panel.scatter(demo.iloc[:, feature_col], demo.iloc[:, 0])
    panel.set_xlabel(data.columns[feature_col + 1])
# Only the top-left panel carries a y-axis label.
ax[0][0].set_ylabel(data.columns[1])
plt.show()
# Build a reduced table without the column at position 2 of ``demo`` and
# draw a fresh random split from it.
demo0 = demo.drop(columns=demo.columns[2])
x_train, y_train, x_test, y_test = np_split_data(demo0, 0.8)

Q2 = LinearRegression(x_train, y_train, x_test, y_test)
Q2.train()

# Score the held-out split.  Predictions come back as a column, so the
# targets are reshaped to match before any element-wise arithmetic.
x_test, y_test = Q2.get_test_data()
pre = Q2.predict(x_test)
y_test = np.array(y_test).reshape(pre.shape)
test_residual = y_test - pre
Y_loss = np.sum(np.square(test_residual))
RMSE = rmse(y_test, pre)

# NOTE(review): Loss and RMSE above use the test split, but this R2 is
# computed on the training split - confirm that is intended.
train_r2 = r2_score(Q2.y_train, Q2.predict(Q2.x_train))

print("Detailed data presentation of Q2 model")
print("The test machine data house price forecast is", pre.reshape(1, -1))
print("Loss value:", Y_loss)
print("RMSE:", RMSE)
print("R2 value: ", train_r2)
print("Weight vector:", Q2.w)
Detailed data presentation of Q2 model
The test machine data house price forecast is [[-0.73152743 -1.27962581 0.0920423 -0.95137815 0.55296749 0.3941641
0.03079132 -0.83098956 0.24954459 -0.62426603 -1.43822556 0.15402153
-1.15424147 -0.50879432 1.65785734 1.43012849 -0.25162625 0.78827719
-0.72864504 1.20641756 0.05650989 -0.52213994 -1.15030241 -0.34219166
1.08861407 0.07005915]]
Loss value: 4.707307862404436
RMSE: 0.4255000615748141
R2 value: 0.8447121600566533
Weight vector: [[ 0.52402705 0.20488842 -0.40842359 0.32380326 0.35594341]]
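· Ranked by RMSE (lower is better): Q2 (feature-subset model, 0.4255) < Q1 (full linear regression model, 0.5033), so Q2 is better
· Ranked by R2 (higher is better): Q2 (feature-subset model, 0.8447) > Q1 (full linear regression model, 0.7824), so Q2 is better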
Original: https://blog.csdn.net/pylittlebrat/article/details/127180021
Author: ACxz
Title: 纯numpy实现线性回归
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/762181/
转载文章受原作者版权保护。转载请注明原作者出处!