Logistic regression implementation - loss does not converge and the model performs poorly
I am trying to implement a logistic regression model, i.e. a binary classifier.
I need to use stochastic gradient descent together with the closed form of the gradient of the binary cross-entropy loss.
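Concretely, for a single training example (x, y) with prediction a = \sigma(w^\top x), the closed form I am referring to is:

\ell(w) = -\big[\, y \log a + (1 - y) \log(1 - a) \,\big], \qquad \frac{\partial \ell}{\partial w} = (a - y)\, x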
After trying to train the model on data, it does not seem to work properly:
the loss does not decrease or converge as the number of iterations grows. A learning rate of eta = 0.002 helps a little, but beyond that the model only seems to converge if I first set the learning rate much larger (i.e. eta >> 0.002) and then reset it back to 0.002.
When I then evaluate the model, the results are very poor, roughly on par with a naive model that predicts the same label for every test example:
Confusion Matrix:
[[344. 240.]
[294. 322.]]
True Negatives (TN): 322.0
False Positives (FP): 294.0
False Negatives (FN): 240.0
True Positives (TP): 344.0
Sensitivity (Se): 0.589041095890411
Specificity (Sp): 0.5227272727272727
Positive Predictive Value (PPV): 0.5391849529780565
Negative Predictive Value (NPV): 0.5729537366548043
Accuracy (Acc): 0.555
F1 Score: 0.563011456628478
Area Under the ROC Curve (AUC): 0.555
What is wrong with this implementation?
import numpy as np
import matplotlib.pyplot as plt


def sigmoid(z):
    sig = 1 / (1 + np.exp(-z))
    return sig


class ManualLogisticRegression:
    def __init__(self, random_state=1):
        np.random.seed(random_state)
        self.w = np.random.randn(5)

    def fit(self, X, Y, eta=0.005, plot=False):
        if plot:
            loss_vec = np.zeros(len(X))
        for idx, (x, y) in enumerate(zip(X, Y)):
            z = np.dot(x, self.w)
            a = sigmoid(z)
            grad = np.dot(x.T, (a - y))
            self.w -= eta * grad
            if plot:
                loss_vec[idx] = self.log_loss(X, Y)
        if plot:
            plt.plot(loss_vec)
            plt.xlabel('# of iterations')
            plt.ylabel('Loss')

    def log_loss(self, x, y):
        z = np.dot(x, self.w)
        p = sigmoid(z)
        epsilon = 1e-5
        p = np.clip(p, epsilon, 1 - epsilon)
        log_loss = (-1 / len(x)) * np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
        return log_loss

    def predict_proba(self, x):
        """
        This function computes the probability that every example in x belongs to class "1", using the trained model.
        :param x: Feature matrix (can also be a single vector).
        :return: Vector with one entry per example in x, where every element is the probability of belonging to class "1".
        """
        z = np.dot(x, self.w)
        y_pred_proba = sigmoid(z)
        return y_pred_proba

    def predict(self, x, thresh=0.5):
        """
        This function labels every example according to the calculated probability, using a threshold.
        :param x: Feature matrix (can also be a single vector).
        :param thresh: Decision threshold.
        :return: Vector with one entry per example in x, where every element is the estimated label (0 or 1).
        """
        z = np.dot(x, self.w)
        probabilities = sigmoid(z)
        y_pred = np.where(probabilities >= thresh, 1, 0)
        return y_pred

    def score(self, x, y):
        """
        This function computes the accuracy of the trained model's estimations.
        :param x: Feature matrix (can also be a single vector).
        :param y: Corresponding true labels (either 1 or 0).
        :return: Estimator's accuracy.
        """
        return np.sum(self.predict(x) == y) / len(y)

    def conf_matrix(self, x, y):
        """
        This function computes the confusion matrix for the predictions of the trained model. The first value of the
        matrix was given as a hint.
        :param x: Feature matrix (can also be a single vector).
        :param y: Corresponding true labels (either 1 or 0).
        :return: Confusion matrix.
        """
        conf_mat = np.zeros((2, 2))
        y_pred = self.predict(x)
        conf = (y_pred == y)
        conf_mat[0, 0] += np.sum(1 * (conf[y_pred == 0] == 1))
        # The provided hint counts the examples predicted as 0 whose prediction matches the label,
        # i.e. the true negatives (TN): if y == 0 we get True, if y == 1 we get False.
        conf_mat[1, 0] += np.sum(1 * (conf[y_pred == 0] == 0))  # FN
        conf_mat[0, 1] += np.sum(1 * (conf[y_pred == 1] == 0))  # FP
        conf_mat[1, 1] += np.sum(1 * (conf[y_pred == 1] == 1))  # TP
        # --------------------------------------------------------------------------------------
        return conf_mat
Here is the content of the notebook:
%load_ext autoreload
%autoreload 2
from manual_log_reg import ManualLogisticRegression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
X = pd.read_csv('X_data.csv')
X.drop(columns=X.columns[0], axis=1, inplace=True)
X.head()
y = pd.read_csv('y_data.csv') # read and convert to numpy
y.drop(columns=y.columns[0], axis=1, inplace=True)
y.head()
X = X.values # convert to numpy
y = y.values.astype(int).flatten() # convert to numpy integers and flatten
X = np.concatenate((np.ones((len(y), 1)), X), axis=1) # add bias term
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
log_reg = ManualLogisticRegression()
log_reg.fit(X_train, y_train, eta=0.003, plot=True)
sorted_weights = np.sort(np.abs(log_reg.w[:-1])) # Exclude bias term and sort by absolute value
most_important_feature_index = np.argmax(np.abs(log_reg.w[:-1])) # Find the index of the most important feature
most_important_feature_weight = log_reg.w[most_important_feature_index] # Get the weight of the most important feature
print(f"The most important feature is feature {most_important_feature_index + 1} with weight {most_important_feature_weight}.")
conf_matrix = log_reg.conf_matrix(X_test, y_test)
print("Confusion Matrix:")
print(conf_matrix)
# Calculate additional performance metrics
TN = conf_matrix[1, 1]
FP = conf_matrix[1, 0]
FN = conf_matrix[0, 1]
TP = conf_matrix[0, 0]
Se = TP / (TP + FN)
Sp = TN / (TN + FP)
PPV = TP / (TP + FP)
NPV = TN / (TN + FN)
Acc = (TP + TN) / (TP + TN + FP + FN)
F1 = 2 * (PPV * Se) / (PPV + Se)
# Calculate AUC using the score method of ManualLogisticRegression
AUC = log_reg.score(X_test, y_test)
# Report the performance metrics
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"True Positives (TP): {TP}")
print(f"Sensitivity (Se): {Se}")
print(f"Specificity (Sp): {Sp}")
print(f"Positive Predictive Value (PPV): {PPV}")
print(f"Negative Predictive Value (NPV): {NPV}")
print(f"Accuracy (Acc): {Acc}")
print(f"F1 Score: {F1}")
print(f"Area Under the ROC Curve (AUC): {AUC}")
conf_mat = log_reg.conf_matrix(X_test, y_test)
import seaborn as sns
import matplotlib.pyplot as plt
# Plot confusion matrix
sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()
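For what it is worth, I was also planning to sanity-check these numbers against sklearn with something along these lines (just a sketch; roc_auc_score is already imported above and expects the predicted probabilities):

from sklearn.metrics import confusion_matrix, accuracy_score

y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)
# sklearn lays the confusion matrix out as [[TN, FP], [FN, TP]] for labels {0, 1}
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))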
1 Answer
Probably the biggest problem in your implementation is the derivative. The derivative with respect to the vector w should itself be a vector, not a scalar. The correct way to compute the gradient is:
grad = (a - y) * x
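Component-wise this is \partial \ell / \partial w_j = (a - y)\, x_j, so the gradient has the same dimension as w.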
Secondly, training the model for only a single epoch does not seem reasonable. You can try running it for multiple epochs by changing fit to:
def fit(self, X, Y, epochs=10, eta=0.005, plot=False):
    if plot:
        loss_vec = np.zeros(epochs)
    for epoch in range(epochs):
        for (x, y) in zip(X, Y):
            z = np.dot(x, self.w)
            a = sigmoid(z)
            grad = (a - y) * x
            self.w -= eta * grad
        if plot:
            loss_vec[epoch] = self.log_loss(X, Y)
    if plot:
        plt.plot(loss_vec)
        plt.xlabel('# of epochs')
        plt.ylabel('Loss')
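In the notebook, the training call would then look something like this (the epoch count is only an illustration; pick it by watching the loss curve):

log_reg.fit(X_train, y_train, epochs=50, eta=0.003, plot=True)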