chapter3 逻辑回归手动+sklearn版本
时间:2022-02-10 作者:BlairGrowing
1 导入numpy包
import numpy as np
2 sigmoid函数
def sigmoid(x):
    """Apply the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x: A scalar, a NumPy array, or an array-like (list/tuple) of numbers.

    Returns:
        The sigmoid of ``x``: a NumPy array for array-like input, a NumPy
        scalar for scalar input.
    """
    # np.asarray lets plain Python lists through; unary minus applied
    # directly to a list would raise TypeError.
    return 1 / (1 + np.exp(-np.asarray(x)))


demox = np.array([1, 2, 3])
print(sigmoid(demox))
# A plain list is accepted too (it raised an error before the input was
# converted with np.asarray):
# demox = [1, 2, 3]
# print(sigmoid(demox))
结果:
[0.73105858 0.88079708 0.95257413]
3 定义逻辑回归模型主体
### Logistic regression model body
def logistic(x, y, w, b):
    """Run one forward pass and compute the loss and parameter gradients.

    Args:
        x: Feature matrix; its first axis indexes the training samples.
        y: Labels, combined element-wise with the model output
            (assumed 0/1 values shaped like ``np.dot(x, w)`` - confirm
            against the caller).
        w: Weights, used as ``np.dot(x, w)``.
        b: Bias added to the linear term.

    Returns:
        Tuple ``(y_hat, cost, dW, db)``: the sigmoid outputs, the mean
        cross-entropy loss (squeezed), and the gradients of the loss with
        respect to ``w`` and ``b``.
    """
    # Number of training samples
    num_train = x.shape[0]
    # Model output
    y_hat = sigmoid(np.dot(x, w) + b)
    # Keep probabilities strictly inside (0, 1) for the loss only, so that a
    # saturated prediction cannot produce log(0) = -inf / nan.
    eps = np.finfo(float).eps
    y_safe = np.clip(y_hat, eps, 1 - eps)
    # Cross-entropy loss
    cost = -1 / num_train * np.sum(y * np.log(y_safe) + (1 - y) * np.log(1 - y_safe))
    # Weight gradient
    dW = np.dot(x.T, (y_hat - y)) / num_train
    # Bias gradient
    db = np.sum(y_hat - y) / num_train
    # Drop singleton dimensions from the loss
    cost = np.squeeze(cost)
    return y_hat, cost, dW, db
4 初始化函数
def init_parm(dims):
    """Create zero-initialised model parameters.

    Args:
        dims: Number of weights (one per input feature).

    Returns:
        Tuple ``(w, b)`` where ``w`` is a ``(dims, 1)`` array of zeros and
        ``b`` is the scalar ``0``.
    """
    w = np.zeros((dims, 1))
    b = 0
    return w, b
5 定义逻辑回归模型训练过程
### Logistic regression training loop
def logistic_train(X, y, learning_rate, epochs):
    """Fit the logistic regression parameters with batch gradient descent.

    Args:
        X: Feature matrix; ``X.shape[1]`` sets the number of weights.
        y: Training labels, passed through to ``logistic``.
        learning_rate: Step size applied to both gradients.
        epochs: Number of gradient-descent iterations.

    Returns:
        Tuple ``(cost_list, params, grads)``: the loss recorded every 100
        iterations, the final ``{'W', 'b'}`` parameters and the last
        ``{'dW', 'db'}`` gradients (both ``None`` when ``epochs`` is 0).
    """
    # Initialise the model parameters
    W, b = init_parm(X.shape[1])
    cost_list = []
    # Defined up front so the return value is well-formed even if the loop
    # body never runs.
    dW, db = None, None
    for i in range(epochs):
        # Current predictions, loss and parameter gradients
        _, cost, dW, db = logistic(X, y, W, b)
        # Gradient-descent update
        W = W - learning_rate * dW
        b = b - learning_rate * db
        # Record and report the loss every 100 iterations
        if i % 100 == 0:
            cost_list.append(cost)
            print('epoch %d cost %f' % (i, cost))
    params = {
        'W': W,
        'b': b,
    }
    grads = {
        'dW': dW,
        'db': db,
    }
    return cost_list, params, grads
6 定义预测函数
def predict(X, params):
    """Predict 0/1 class labels with the trained parameters.

    Args:
        X: Feature matrix, multiplied as ``np.dot(X, params['W'])``.
        params: Dict holding the weights under ``'W'`` and the bias under
            ``'b'``.

    Returns:
        A list of Python ints: 1 where the predicted probability is greater
        than 0.5, otherwise 0.
    """
    y_pred = sigmoid(np.dot(X, params['W']) + params['b'])
    # Flatten the probabilities and threshold them in one vectorised step.
    return (np.ravel(y_pred) > 0.5).astype(int).tolist()
7 生成数据
# Plotting library
import matplotlib.pyplot as plt
# Generator for synthetic classification data
from sklearn.datasets import make_classification

# Generate a 100 x 2 synthetic binary classification data set
x, label = make_classification(
    n_samples=100,           # number of samples
    n_classes=2,             # number of classes
    n_features=2,            # number of features
    n_redundant=0,           # redundant features (random combinations of informative ones)
    n_informative=2,         # informative features
    n_repeated=0,            # repeated features (drawn from informative and redundant ones)
    n_clusters_per_class=2,  # clusters per class
    random_state=1,
)
print("x.shape =", x.shape)
print("label.shape = ", label.shape)
print("np.unique(label) =", np.unique(label))
print(set(label))

# Seeded random number generator
rng = np.random.RandomState(2)
# Add uniformly distributed noise to the features.
# Reference from the original post (host lost in extraction):
# .../vicdd/article/details/52667709
x += 2 * rng.uniform(size=x.shape)

# Distinct class labels
unique_label = set(label)
# One colour per class label
print(np.linspace(0, 1, len(unique_label)))
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_label)))
print(colors)

# Scatter plot of the simulated data; iterate the labels in sorted order so
# the label-to-colour pairing does not depend on set iteration order.
for k, col in zip(sorted(unique_label), colors):
    x_k = x[label == k]
    plt.plot(x_k[:, 0], x_k[:, 1], 'o', markerfacecolor=col, markeredgecolor="k",
             markersize=14)
plt.title('Simulated binary data set')
plt.show()
结果:
x.shape = (100, 2) label.shape = (100,) np.unique(label) = [0 1] {0, 1} [0. 1.] [[0.61960784 0.00392157 0.25882353 1. ] [0.36862745 0.30980392 0.63529412 1. ]]
复习
# Review: turn the labels into a column vector and append it to the features
mylabel = label.reshape((-1, 1))
data = np.concatenate((x, mylabel), axis=1)
print(data.shape)
结果:
(100, 3)
8 划分数据集
# The first 70% of the samples form the training set, the rest the test set.
offset = int(x.shape[0] * 0.7)
# Labels are reshaped to column vectors of shape (n, 1).
x_train, y_train = x[:offset], label[:offset].reshape((-1, 1))
x_test, y_test = x[offset:], label[offset:].reshape((-1, 1))
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
结果:
(70, 2) (70, 1) (30, 2) (30, 1)
9 训练
# NOTE(review): the learning-rate literal was lost when this text was
# extracted; 0.01 is assumed here - confirm against the original post.
cost_list, params, grads = logistic_train(x_train, y_train, 0.01, 1000)
print(params['b'])
结果:
epoch 0 cost 域名47 epoch 100 cost 域名43 epoch 200 cost 域名25 epoch 300 cost 域名32 epoch 400 cost 域名18 epoch 500 cost 域名60 epoch 600 cost 域名86 epoch 700 cost 域名09 epoch 800 cost 域名74 epoch 900 cost 域名89 -域名648941379839
10 准确率计算
from sklearn.metrics import accuracy_score, classification_report

y_pred = predict(x_test, params)
print("y_pred = ", y_pred)
print(y_pred)
print(y_test.shape)
# The inputs need not both be 1-D; accuracy_score appears to squeeze the
# column vector automatically. Arguments follow the (y_true, y_pred) order.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
结果:
y_pred = [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0] [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0] (30, 1) 域名333333333333 precision recall f1-score support 0 域名 域名 域名 12 1 域名 域名 域名 18 accuracy 域名 30 macro avg 域名 域名 域名 30 weighted avg 域名 域名 域名 30
11 绘制逻辑回归决策边界
### Plot the logistic regression decision boundary
def plot_logistic(X_train, y_train, params):
    """Scatter the two classes and draw the learned decision boundary.

    Args:
        X_train: Feature matrix; columns 0 and 1 are plotted.
        y_train: Labels; samples equal to 1 are drawn in red, all others in
            green.
        params: Dict with the weights under ``'W'`` (entries 0 and 1 are
            used) and the bias under ``'b'``.
    """
    # Imported locally so the function does not rely on a module-level `plt`.
    import matplotlib.pyplot as plt

    X_train = np.asarray(X_train)
    # Boolean mask of the samples labelled 1
    positive = np.ravel(y_train) == 1

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_train[positive, 0], X_train[positive, 1], s=30, c='red')
    ax.scatter(X_train[~positive, 0], X_train[~positive, 1], s=30, c='green')
    # Range of the first feature over which the boundary is drawn
    x = np.arange(-1.5, 3, 0.1)
    # Decision boundary: W[0] * x + W[1] * y + b = 0, solved for y
    y = (-params['b'] - params['W'][0] * x) / params['W'][1]
    # Draw the boundary
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


plot_logistic(x_train, y_train, params)
结果:
12 sklearn实现
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit expects 1-D targets; ravel the (n, 1) column vector to avoid a
# DataConversionWarning.
clf = LogisticRegression(random_state=0).fit(x_train, y_train.ravel())
y_pred = clf.predict(x_test)
print(y_pred)
# Printed explicitly: a bare expression only displays its value in a notebook.
print(accuracy_score(y_test, y_pred))
结果:
[0 0 1 1 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 1 1 0 1 1 0 0 1 0] 域名333333333333