关于Logistic回归的几个问题

import numpy as np
import matplotlib.pyplot as plt


def loadDataSet():
    """Read the feature and label files into Python lists.

    Returns:
        A ``(dataMat, labelMat)`` tuple. Each row of ``dataMat`` is
        ``[1.0, x1, x2]`` (the leading 1.0 is the intercept term) and
        ``labelMat`` is a flat list with one float label per sample.
    """
    dataMat = []
    labelMat = []
    # Context managers close both files even if a line fails to parse.
    with open('../ex4x.dat') as frX:
        for line1 in frX:
            lineArr1 = line1.strip().split()
            dataMat.append([1.0, float(lineArr1[0]), float(lineArr1[1])])
    with open('../ex4y.dat') as frY:
        for line2 in frY:
            lineArr2 = line2.strip().split()
            labelMat.append(float(lineArr2[0]))
    return dataMat, labelMat


def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^(-inX)), element-wise."""
    return 1.0 / (1 + np.exp(-inX))


# def autoNorm(dataSet):
#     # newValue = (oldValue-min)/(max-min)
#     minVals = min(dataSet)
#     maxVals = max(dataSet)
#     ranges = list(map(lambda x: x[0]-x[1], zip(maxVals, minVals)))
#     normDataSet = zeros(shape(dataSet))
#     m,n = shape(dataSet)
#     normDataSet = list(map(lambda x: x[0]-x[1], zip(dataSet,tile(minVals, (m,1)))))
#     normDataSet = normDataSet/tile(ranges, (m,1))
#     return normDataSet, ranges, minVals


def gradDescent(dataMatIn, classLabels):
    """Run batch gradient descent for logistic regression.

    The weights start at 1, the step size is 0.001 and 100000 iterations are
    performed. The cost printed on every iteration is the summed (not
    averaged over m) cross-entropy.

    Args:
        dataMatIn: Rows of ``[1.0, x1, x2]`` features.
        classLabels: Flat sequence with one label per row of ``dataMatIn``.

    Returns:
        An ``(n, 1)`` ``np.matrix`` holding the fitted weights.
    """
    # np.asmatrix rather than np.mat: the ``mat`` alias was removed in NumPy 2.0.
    x = np.asmatrix(dataMatIn)
    y = np.asmatrix(classLabels).transpose()
    m, n = np.shape(x)
    alpha = 0.001
    maxCycles = 100000
    theta = np.ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(x * theta)
        error = h - y
        cost = -1 * np.dot(np.log(h).T, y) - np.dot((1 - y).T, np.log(1 - h))
        # cost is a 1x1 matrix; index it to format a plain scalar.
        print("Iteration %d | Cost: %f" % (k, cost[0, 0]))
        theta = theta - alpha * (x.transpose() * error / m)
    return theta


def plotBestFit(weights):
    """Scatter-plot both classes and draw the line w0 + w1*x1 + w2*x2 = 0.

    Args:
        weights: Array indexable as ``weights[0]``, ``weights[1]`` and
            ``weights[2]`` (intercept, x1 weight, x2 weight).
    """
    dataMat, labelMat = loadDataSet()
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # Scalar column extrema; avoids builtin min/max over 1x1 matrix rows.
    min_x = dataArr[:, 1].min()
    max_x = dataArr[:, 1].max()
    x = np.arange(min_x, max_x, 1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel)
print(weights)
plotBestFit(weights.getA())

1条回答

网友

1楼 · 发布于 2024-06-16 15:02:21

你的代码其实很好！以下是一些评论：

你把θ全部初始化为1。在这个例子中我不会这样做：第一次调用sigmoid函数时，由于theta与x的乘积非常大，返回值会接近1，此时计算log(1 - h)可能出错，因为log在0处没有定义。我更倾向于把theta全部初始化为0。
在计算成本函数时，您漏掉了除以m。这不影响算法的结果，但最好与理论公式保持一致。
最好绘制成本函数，而不仅仅是打印其值。正确的趋势可以看得很清楚。
要达到收敛，这个例子需要更多的迭代次数。我用500,000次迭代得到了很好的结果。

帖子已更新，请参阅下面的更新

以下是我得到的图：

如您所见，生成的分隔线与教程中显示的图非常吻合。

这是我的代码。它和你的略有不同，但总体上非常相似。

import numpy as np
import matplotlib.pyplot as plt

def loadDataSet(x_path='../ex4x.dat', y_path='../ex4y.dat'):
    """Load the feature and label files into plain Python lists.

    Args:
        x_path: Whitespace-separated text file with (at least) two numeric
            feature values per line.
        y_path: Text file with one numeric class label per line.

    Returns:
        A ``(dataMat, labelMat)`` tuple. Each row of ``dataMat`` is
        ``[1.0, x1, x2]`` (the leading 1.0 is the intercept term) and each
        row of ``labelMat`` is a single-element list ``[label]``.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a field is not a valid float.
        IndexError: If a non-blank feature line has fewer than two fields.
    """
    dataMat = []
    labelMat = []

    # Context managers guarantee both handles are closed, even on a parse error.
    with open(x_path) as frX:
        for line in frX:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line carries no sample.
                continue
            dataMat.append([1.0, float(fields[0]), float(fields[1])])

    with open(y_path) as frY:
        for line in frY:
            fields = line.split()
            if not fields:
                continue
            labelMat.append([float(fields[0])])

    return dataMat, labelMat

def sigmoid(inX):
    """Evaluate the logistic function 1 / (1 + e^(-inX)) element-wise."""
    exp_term = np.exp(-inX)
    denominator = 1 + exp_term
    return 1.0 / denominator

def gradDescent(dataMatIn, classLabels, alpha, maxCycles, plot_cost=True):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        dataMatIn: Two-dimensional sequence with one sample per row; every
            column gets a weight, so an intercept column of 1.0 must already
            be part of the rows if an intercept is wanted.
        classLabels: One label per sample, either as an ``m x 1`` column
            (one single-element row per sample) or as a flat sequence of
            ``m`` values.
        alpha: Step size applied to the gradient on every iteration.
        maxCycles: Number of gradient-descent iterations to run.
        plot_cost: When true (the default), plot the cost recorded on every
            1000th iteration and call ``plt.show()`` before returning.

    Returns:
        The fitted weights as an ``(n, 1)`` ``np.matrix``, where ``n`` is the
        number of columns of ``dataMatIn``. With ``maxCycles == 0`` the
        untouched initial ``np.zeros((n, 1))`` array is returned.

    Raises:
        ValueError: If ``classLabels`` does not hold exactly one value per
            row of ``dataMatIn``.
    """
    # np.asmatrix instead of np.mat: the ``mat`` alias was removed in NumPy 2.0.
    x = np.asmatrix(dataMatIn)
    y = np.asmatrix(classLabels)
    m, n = np.shape(x)

    # A flat label list becomes a 1 x m matrix; without this, ``h - y`` would
    # silently broadcast to an m x m matrix instead of failing.
    if y.shape != (m, 1):
        y = y.reshape(m, 1)

    theta = np.zeros((n, 1))

    cost_history = []       # cost sampled on every 1000th iteration

    for k in range(maxCycles):
        h = sigmoid(x * theta)

        # The cost is only used for the history plot, so it is evaluated
        # solely on the iterations that are actually recorded.
        if k % 1000 == 0:
            cost = ((-np.multiply(y, np.log(h))
                     - np.multiply(1 - y, np.log(1 - h))).sum(axis=0) / m)[0, 0]
            cost_history.append(cost)

        grad = (x.transpose() * (h - y)) / m
        theta = theta - alpha * grad

    if plot_cost:
        plt.plot(cost_history)
        plt.title("Cost")
        plt.show()

    return theta

def plotBestFit(dataMat, classLabel, weights):
    """Scatter-plot both classes and draw the fitted decision boundary.

    Args:
        dataMat: Two-dimensional sequence of samples; column 1 is plotted on
            the horizontal axis and column 2 on the vertical axis.
        classLabel: Labels aligned with the rows of ``dataMat``; rows
            labelled 1 are drawn in red, rows labelled 0 in blue.
        weights: Two-dimensional array or matrix indexed as ``weights[i, 0]``
            for i = 0, 1, 2 (intercept, x1 weight, x2 weight).
    """
    arrY = np.asarray(classLabel)
    arrX = np.asarray(dataMat)
    ind1 = np.where(arrY == 1)[0]
    ind0 = np.where(arrY == 0)[0]

    # The boundary w0 + w1*x1 + w2*x2 = 0 is a straight line, so its two end
    # points over the observed x1 range are enough to draw it across the
    # whole data set. Column min()/max() yield plain scalars; this avoids
    # np.mat (removed in NumPy 2.0) and builtin min/max over 1x1 matrices.
    x1_column = arrX[:, 1]
    x1_val = np.array([x1_column.min(), x1_column.max()])
    x2_val = (-weights[0, 0] - weights[1, 0] * x1_val) / weights[2, 0]

    plt.scatter(arrX[ind1, 1], arrX[ind1, 2], s=30, c='red', marker='s')
    plt.scatter(arrX[ind0, 1], arrX[ind0, 2], s=30, c='blue', marker='s')
    plt.plot(x1_val, x2_val)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=18)
    plt.title("Separation border")
    plt.show()


# Load the samples, fit the weights (step size 0.0014, 500000 iterations),
# then print the weights and plot the resulting decision boundary.
dataMat, classLabel = loadDataSet()
weights = gradDescent(dataMat, classLabel, alpha=0.0014, maxCycles=500000)

print(weights)
plotBestFit(dataMat, classLabel, weights)

更新

在阅读了您对第一版文章的评论中提出的问题后，我尝试优化代码，以便用更少的迭代次数实现成本函数的收敛。

事实上，特征标准化创造了奇迹 :)

仅仅经过30次迭代，就获得了更好的结果！

以下是新的图：

由于使用了标准化，在对每个新的测试样本进行分类之前，都需要先用同样的方式对其进行缩放。

这是新的代码。我更改了一些数据类型，以避免不必要的数据类型转换。

^{pr2}$

相关问题更多 >

编程相关推荐

热门问题

热门文章