机器学习:LR逻辑回归(实战)
最编程
2024-08-13 11:02:03
...
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
"""
函数说明:sigmoid函数
Parameters:
inX - 数据
Returns:
sigmoid函数
"""
def sigmoid(inX):
return 1.0 / (1 + np.exp(-inX))
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent for logistic regression.

    Parameters:
        dataMatrix  - numpy array of samples, shape (m, n)
        classLabels - sequence of 0/1 labels, length m
        numIter     - number of full passes over the data (default 150)
    Returns:
        weights - learned regression coefficients, numpy array of shape (n,)
    """
    # m is the number of samples, n the number of features.
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        # Indices of the samples not yet used in this pass.
        dataIndex = list(range(m))
        for i in range(m):
            # Anneal the step size as training progresses; the 0.01
            # constant keeps alpha from ever reaching zero.
            alpha = 4 / (1.0 + j + i) + 0.01
            # Pick a random position among the remaining indices.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # BUGFIX: index the data through dataIndex so each sample is
            # used exactly once per pass. The original indexed dataMatrix
            # with randIndex directly, which biased sampling toward the
            # front of the data and made the deletion below meaningless.
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            # Remove the used index so it cannot be drawn again this pass.
            del dataIndex[randIndex]
    return weights
"""
#函数说明:梯度上升算法
Parameters:
dataMatIn - 数据集
classLabels - 数据标签
Returns:
weights.getA() - 求得的权重数组(最优参数)
"""
def gradAscent(dataMatIn, classLabels):
# 转换成numpy的mat
dataMatrix = np.mat(dataMatIn)
# 转换成numpy的mat,并进行转置
labelMat = np.mat(classLabels).transpose()
# 返回dataMatrix的大小。m为行数,n为列数。
m, n = np.shape(dataMatrix)
# 移动步长,也就是学习速率,控制更新的幅度。
alpha = 0.001
# 最大迭代次数
maxCycles = 500
weights = np.ones((n,1))
for k in range(maxCycles):
# 梯度上升矢量化公式
# g(X)=h(theta) = theta * X
h = sigmoid(dataMatrix * weights)
error = labelMat - h
# theta = theta + alpha * X^T(y - g(X))
weights = weights + alpha * dataMatrix.transpose() * error
return weights.getA()
def colicTest():
    """Train a logistic classifier on horseColicTraining.txt and report
    the error rate on horseColicTest.txt.

    Each file is tab-separated; all columns but the last are features,
    the last column is the 0/1 label. Prints the error rate; returns None.
    """
    trainingSet = []
    trainingLabels = []
    # 'with' guarantees the file handles are closed even on error
    # (the original leaked both file objects).
    with open('horseColicTraining.txt') as frTrain:
        for line in frTrain:
            currLine = line.strip().split('\t')
            trainingSet.append([float(v) for v in currLine[:-1]])
            trainingLabels.append(float(currLine[-1]))
    # Train with batch gradient ascent.
    trainWeights = gradAscent(np.array(trainingSet), trainingLabels)
    errorCount = 0
    numTestVec = 0.0
    with open('horseColicTest.txt') as frTest:
        for line in frTest:
            numTestVec += 1.0
            currLine = line.strip().split('\t')
            lineArr = [float(v) for v in currLine[:-1]]
            # trainWeights is (n, 1); take the column so the elementwise
            # product in classifyVector has matching shape.
            if int(classifyVector(np.array(lineArr), trainWeights[:, 0])) != int(currLine[-1]):
                errorCount += 1
    errorRate = (float(errorCount) / numTestVec) * 100
    print("测试集合错误率为: %.2f%%" % errorRate)
"""
函数说明:分类函数
Parameters:
inX - 特征向量
weights - 回归系数
returns:
分类结果
"""
def classifyVector(inX, weights):
prob = sigmoid(sum(inX * weights))
if prob > 0.5:
return 1.0
else:
return 0.0
def colicSklearn():
    """Train sklearn's LogisticRegression on horseColicTraining.txt and
    print its accuracy on horseColicTest.txt.

    Both files are tab-separated; all columns but the last are features,
    the last column is the label. Prints the accuracy; returns None.
    """
    def _load(path):
        # Parse one tab-separated data file into (features, labels);
        # 'with' closes the handle (the original leaked both files).
        features, labels = [], []
        with open(path) as fh:
            for line in fh:
                fields = line.strip().split('\t')
                features.append([float(v) for v in fields[:-1]])
                labels.append(float(fields[-1]))
        return features, labels

    trainingSet, trainingLabels = _load('horseColicTraining.txt')
    testSet, testLabels = _load('horseColicTest.txt')
    # 'sag' needs many iterations to converge on this un-scaled data,
    # hence the large max_iter.
    classifier = LogisticRegression(solver='sag', max_iter=5000).fit(trainingSet, trainingLabels)
    test_accuracy = classifier.score(testSet, testLabels) * 100
    print('正确率:%f%%' % test_accuracy)
if __name__ == '__main__':
    # BUGFIX: the original had two identical __main__ guards, so
    # colicSklearn() ran twice; keep a single entry point.
    # To test the hand-written gradient-ascent classifier instead,
    # call colicTest() here.
    colicSklearn()
上一篇: 快速入门Python机器学习(33)
下一篇: 逻辑回归参数详解