机器学习 Python 实践 - 第 3 章 - 分类问题 - 2. Logistic 回归算法 - 3.2.3 实践

最编程 2024-10-12 20:18:12

...

1.数据集

1.线性回归
在线性回归问题中，使用的数据集是sklearn自带的一个糖尿病病人的数据集。该数据集从糖尿病病人采样并整理后，特点如下：

数据集有442个样本。

每个样本有10个特征。

每个特征都是浮点数，数据的范围是 -0.2 ~ 0.2。

样本的目标为25~346的整数。

这里给出加载数据集的函数：
from sklearn import datasets
from sklearn.model_selection import train_test_split


def load_data():
    """Load the scikit-learn diabetes dataset and split it for training/testing.

    Returns:
        The result of ``train_test_split`` on the dataset's samples and
        targets, with 25% of the data held out (``test_size=0.25``) and a
        fixed ``random_state`` of 0 so the split is repeatable.
    """
    dataset = datasets.load_diabetes()
    samples, targets = dataset.data, dataset.target
    return train_test_split(samples, targets, test_size=0.25, random_state=0)
该函数的返回值是一个元组，元组中依次是：训练样本集、测试样本集、训练样本集对应的标签值、测试样本集对应的标签值。

2.逻辑回归

为了测试逻辑回归模型的分类性能，此处选用经典的数据集：鸢尾花数据集。

2.sklearn实现

1.线性回归

测试线性回归模型，代码如下：

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
import numpy as np
import pytest


def load_data():
    """Return a train/test split of the scikit-learn diabetes dataset.

    Returns:
        Whatever ``train_test_split`` yields for the dataset's ``data`` and
        ``target`` arrays, using ``test_size=0.25`` and ``random_state=0``.
    """
    bunch = datasets.load_diabetes()
    features = bunch.data
    labels = bunch.target
    return train_test_split(features, labels, test_size=0.25, random_state=0)


@pytest.fixture
def data():
    """Pytest fixture handing the ``load_data()`` split to a test."""
    split = load_data()
    return split


def test_LinearRegression(data):
    """Fit a ``LinearRegression`` model and print its parameters and test metrics.

    Args:
        data: 4-element split ``(X_train, X_test, y_train, y_test)`` as
            provided by the ``data`` fixture.
    """
    X_train, X_test, y_train, y_test = data
    regression = linear_model.LinearRegression()
    regression.fit(X_train, y_train)
    print('\nCoefficients:%s, intercept:%.2f' % (regression.coef_, regression.intercept_))
    # The mean of the squared residuals is the mean squared error, not the
    # residual *sum* of squares, so label the printed value accordingly.
    mse = np.mean((regression.predict(X_test) - y_test) ** 2)
    print("Mean squared error:%.2f" % mse)
    print('Score:%.2f' % regression.score(X_test, y_test))

代码结果如下：

测试集中预测结果的均方误差为3180.16，预测性能得分为0.36（该值越大越好，最大为1.0）。

2.逻辑回归

测试逻辑回归模型（使用的数据集是鸢尾花数据集，不是糖尿病数据集），代码如下：

def load_iris():
    """Load the Iris dataset and return a stratified train/test split.

    Returns:
        The result of ``train_test_split`` on the Iris samples and labels:
        30% of the data is held out (``test_size=0.3``), the data is shuffled
        with ``random_state=1``, and the split is stratified on the labels.
    """
    # Imported locally: the surrounding snippet never imports the
    # ``model_selection`` module, so the original reference raised NameError.
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    return train_test_split(x, y, test_size=0.3, random_state=1, shuffle=True, stratify=y)


@pytest.fixture
def data2():
    """Pytest fixture handing the ``load_iris()`` split to a test."""
    split = load_iris()
    return split


def test_LogisticRegression(data2):
    """Fit a default ``LogisticRegression`` and print its parameters and test score.

    Args:
        data2: 4-element split ``(X_train, X_test, y_train, y_test)`` as
            provided by the ``data2`` fixture.
    """
    train_x, test_x, train_y, test_y = data2
    model = linear_model.LogisticRegression()
    model.fit(train_x, train_y)
    learned = (model.coef_, model.intercept_)
    print('\nCoefficients:%s, intercept:%s' % learned)
    test_score = model.score(test_x, test_y)
    print('Score:%.2f' % test_score)

代码结果如下

测试集中的预测结果性能得分为0.98，即预测准确率为98%。

下面考察multi_class参数对分类结果的影响。默认采用的是one-vs-rest策略，但是逻辑回归模型的原型就支持多分类，给出的测试函数如下：

def test_LogisticRegression_multinomial(data2):
    """Fit a multinomial ``LogisticRegression`` and print its parameters and test score.

    Args:
        data2: 4-element split ``(X_train, X_test, y_train, y_test)`` as
            provided by the ``data2`` fixture.
    """
    train_x, test_x, train_y, test_y = data2
    # NOTE(review): ``multi_class`` appears to be deprecated in recent
    # scikit-learn releases -- confirm against the installed version.
    model = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')
    model.fit(train_x, train_y)
    learned = (model.coef_, model.intercept_)
    print('\nCoefficients:%s, intercept:%s' % learned)
    test_score = model.score(test_x, test_y)
    print('Score:%.2f' % test_score)

代码结果如下

最后，考察参数C对分类模型的预测性能的影响。C是正则化项系数的倒数，它越小则正则化项的权重越大。测试函数如下：

def test_LogisticRegression_C(data2):
    """Plot the test score of ``LogisticRegression`` against the parameter ``C``.

    One model is fitted for each of 100 values of ``C`` spaced
    logarithmically between 1e-2 and 1e4, and the test-set scores are drawn
    on a log-scaled x axis.

    Args:
        data2: 4-element split ``(X_train, X_test, y_train, y_test)`` as
            provided by the ``data2`` fixture.
    """
    # Imported locally: this snippet's import list does not bring pyplot into
    # scope, so the original body failed with NameError on ``plt``.
    import matplotlib.pyplot as plt

    X_train, X_test, y_train, y_test = data2
    Cs = np.logspace(-2, 4, num=100)
    scores = []
    for C in Cs:
        regression = linear_model.LogisticRegression(C=C)
        regression.fit(X_train, y_train)
        scores.append(regression.score(X_test, y_test))
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(Cs, scores)
    ax.set_xlabel(r"C")
    ax.set_ylabel(r"score")
    ax.set_xscale('log')
    ax.set_title(r"Logistic Regression")
    plt.show()

代码结果如下

测试结果如下图。可以看到随着C的增大（即正则化项减小），LogisticRegression的预测准确率上升。当C增大到一定程度（即正则化项减小到一定程度）时，LogisticRegression的预测准确率维持在较高的水准保持不变。

3.算法实现

为了使用逻辑回归模型对鸢尾花进行分类，此处选用经典的鸢尾花数据集。

现只取数据集 Iris 中的两个特征 Sepal.length（花萼长度）和 Petal.length（花瓣长度），分别定义为 $X_1$、$X_2$，并只保留 y 中的两个类别（0 和 1），将根据 $X_1$、$X_2$ 的值对鸢尾花进行分类。首先绘制这两个特征的散点图，代码如下。

from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np


# Keep the first 100 samples (classes 0 and 1) and two feature columns
# (indices 0 and 2), then draw one scatter series per class.
iris = load_iris()
data, target = iris.data, iris.target
X = data[0:100, [0, 2]]
y = target[0:100]
label = np.array(y)
index_0 = np.where(label == 0)
index_1 = np.where(label == 1)
for idx, marker, colour, name in ((index_0, 'x', 'b', '0'), (index_1, 'o', 'r', '1')):
    plt.scatter(X[idx, 0], X[idx, 1], marker=marker, color=colour, label=name, s=15)
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc='upper left')
plt.show()

代码结果如下

接着编写一个逻辑回归模型的类，然后进行训练和测试，并计算损失函数（损失函数的本质是衡量“模型预估值”到“实际值”的距离）。注意损失函数值越小，模型越好；而且损失函数最好是一个凸函数，便于收敛计算。逻辑回归模型预估的是样本属于某个分类的概率，其损失函数可以采用均方差、对数、概率等方法，下面的代码采用的是对数损失（交叉熵）。计算损失函数的代码如下：

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris


# The Iris target ``y`` is an integer vector of class labels, whereas this
# model performs binary classification and expects labels in {0, 1}; the
# labels must be filtered/converted accordingly before training.
class LogisticRegressionBinary(object):
    """Binary logistic regression trained with batch gradient descent.

    The model has no separate intercept term; add a column of ones to ``X``
    if one is wanted. Labels may be passed either as a 1-D vector or as an
    ``(n, 1)`` column; they are always treated as a column vector.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logs so a
    # saturated sigmoid cannot produce inf/nan in the loss.
    _EPS = 1e-12

    def __init__(self):
        # Weight column vector of shape (num_feature, 1); created by train().
        self.W = None

    def train(self, X, y, lr=0.01, num_iters=5000):
        """Fit the weights by gradient descent and return the loss history.

        Args:
            X: 2-D sample matrix of shape (num_train, num_feature).
            y: Labels (0 or 1), one per row of ``X``; 1-D or (num_train, 1).
            lr: Learning rate (step size) of each update.
            num_iters: Number of gradient-descent iterations.

        Returns:
            List with the loss value computed at every iteration.
        """
        num_train, num_feature = X.shape
        # Normalise once so a 1-D label vector cannot broadcast against the
        # (num_train, 1) predictions into an (n, n) matrix.
        y = np.reshape(y, (-1, 1))
        self.W = 0.001 * np.random.randn(num_feature, 1)
        loss = []
        for i in range(num_iters):
            error, dW = self.compute_loss(X, y)
            self.W -= lr * dW
            loss.append(error)
            if i % 200 == 0:
                print('i= %d, error= %f' % (i, error))
        return loss

    def compute_loss(self, X, y):
        """Return the mean log loss and its gradient with respect to ``W``.

        Args:
            X: 2-D sample matrix of shape (num_train, num_feature).
            y: Labels (0 or 1); 1-D or (num_train, 1).

        Returns:
            Tuple ``(loss, dW)`` where ``dW`` has the same shape as ``W``.
        """
        num_train = X.shape[0]
        y = np.reshape(y, (-1, 1))
        h = self.output(X)
        # Clip only for the logarithms; the gradient uses the raw output.
        h_safe = np.clip(h, self._EPS, 1 - self._EPS)
        loss = - np.sum(y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe)) / num_train
        dW = X.T.dot(h - y) / num_train
        return loss, dW

    def output(self, X):
        """Return the sigmoid of ``X . W`` for every row of ``X``."""
        g = np.dot(X, self.W)
        return self.sigmoid(g)

    def sigmoid(self, X):
        """Apply the logistic function ``1 / (1 + exp(-X))`` element-wise."""
        return 1 / (1 + np.exp(-X))

    def predict(self, X_test):
        """Return 1 where the model output is at least 0.5, otherwise 0."""
        h = self.output(X_test)
        return np.where(h >= 0.5, 1, 0)


# Load the Iris dataset.
iris = load_iris()
X, y = iris.data, iris.target

# Iris has three classes (0, 1, 2); binary classification keeps only the
# samples of classes 0 and 1, with the labels reshaped into a column vector.
binary_filter = y < 2
X, y = X[binary_filter], y[binary_filter].reshape((-1, 1))

# Prepend a column of ones to X so the model gets an intercept term.
one = np.ones((X.shape[0], 1))
X_train = np.hstack((one, X))

# Train the binary logistic-regression model.
classify = LogisticRegressionBinary()
loss = classify.train(X_train, y)

# Print the learned weights.
print("Learned weights:\n", classify.W)

# Plot the loss recorded at each iteration.
plt.plot(loss)
plt.title('Loss curve for Logistic Regression')
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()

（书上代码达不到给出的效果，具体原因以及修改部分我在上述代码中添加了注释）

训练之后，损失值图将显示误差随着迭代次数的增加而减少。

以绘图的方式对决策边界进行可视化处理，代码如下：

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris


class logistic(object):
    """Binary logistic-regression classifier fitted by batch gradient descent.

    There is no built-in intercept: include a constant column in ``X`` to
    model one. Labels are accepted as a 1-D vector or as an ``(n, 1)``
    column and are handled internally as a column vector.
    """

    def __init__(self):
        # Column vector of weights, shape (num_feature, 1); set by train().
        self.W = None

    @staticmethod
    def _as_column(y):
        """Return the labels ``y`` as an (n, 1) column array."""
        return np.asarray(y).reshape(-1, 1)

    def train(self, X, y, lr=0.01, num_iters=5000):
        """Run gradient descent and return the per-iteration loss values.

        Args:
            X: 2-D sample matrix of shape (num_train, num_feature).
            y: Labels (0 or 1), one per row of ``X``; 1-D or (num_train, 1).
            lr: Learning rate applied to every gradient step.
            num_iters: Number of iterations to run.

        Returns:
            List of the loss computed at each iteration.
        """
        num_train, num_feature = X.shape
        # A 1-D label vector would otherwise broadcast against the
        # (num_train, 1) predictions and corrupt the loss and gradient.
        y = self._as_column(y)
        self.W = 0.001 * np.random.randn(num_feature, 1)
        loss = []
        for i in range(num_iters):
            error, dW = self.compute_loss(X, y)
            self.W -= lr * dW
            loss.append(error)
            if i % 200 == 0:
                print('i= %d, error= %f' % (i, error))
        return loss

    def compute_loss(self, X, y):
        """Compute the mean log loss and its gradient with respect to ``W``.

        Args:
            X: 2-D sample matrix of shape (num_train, num_feature).
            y: Labels (0 or 1); 1-D or (num_train, 1).

        Returns:
            Tuple ``(loss, dW)``; ``dW`` has the same shape as ``W``.
        """
        num_train = X.shape[0]
        y = self._as_column(y)
        h = self.output(X)
        # Keep the probabilities strictly inside (0, 1) for the logarithms so
        # a saturated sigmoid yields a finite loss; the gradient uses raw h.
        eps = 1e-12
        p = np.clip(h, eps, 1 - eps)
        loss = - np.sum(y * np.log(p) + (1 - y) * np.log(1 - p)) / num_train
        dW = X.T.dot(h - y) / num_train
        return loss, dW

    def output(self, X):
        """Return the sigmoid of ``X . W`` for every row of ``X``."""
        g = np.dot(X, self.W)
        return self.sigmoid(g)

    def sigmoid(self, X):
        """Apply the logistic function ``1 / (1 + exp(-X))`` element-wise."""
        return 1 / (1 + np.exp(-X))

    def predict(self, X_test):
        """Return 1 where the model output is at least 0.5, otherwise 0."""
        h = self.output(X_test)
        return np.where(h >= 0.5, 1, 0)


# Train on the first 100 Iris samples (classes 0 and 1) using feature
# columns 0 and 2, then plot both classes and the learned decision boundary.
iris = load_iris()
data = iris.data
target = iris.target
X = data[0:100, [0, 2]]
y = target[0:100]
y = y.reshape((-1, 1))
# Prepend a column of ones so W[0] acts as the intercept.
one = np.ones((X.shape[0], 1))
X_train = np.hstack((one, X))
classify = logistic()
loss = classify.train(X_train, y)
label = np.array(y)
# ``label`` is an (n, 1) column here, so np.where on it returns a
# (rows, cols) pair; indexing X with that pair also selects row 0 once per
# sample and plots it in both classes. Flatten first to get row indices only.
index_0 = np.where(label.ravel() == 0)[0]
plt.scatter(X[index_0, 0], X[index_0, 1], marker='x', c='b', label='0', s=15)
index_1 = np.where(label.ravel() == 1)[0]
plt.scatter(X[index_1, 0], X[index_1, 1], marker='o', c='r', label='1', s=15)
# Draw the decision boundary: the line W[0] + W[1]*x1 + W[2]*x2 = 0,
# solved for x2.
x1 = np.arange(4, 7.5, 0.5)
x2 = (- classify.W[0] - classify.W[1] * x1) / classify.W[2]
plt.plot(x1, x2, color='black')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend(loc='upper left')
plt.show()

运行结果如图所示，可以看出，最后学习得到的决策边界成功隔开两个类别。

上一篇：榜上有名！麒麟新安入选 2024 年湖南省重点软件科技项目。

下一篇：单臂路由，实现跨域访问