
 python信用评分卡建模(附代码,博主录制)

https://study.163.com/course/introduction.htm?courseId=1005214003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

逻辑回归原理_挑战者飞船事故和乳腺癌案例_Python和R_信用评分卡(AAA推荐)

参考资料https://www.cnblogs.com/webRobot/p/9034079.html

逻辑回归重点:

1.sigmoid函数(计算好坏客户概率,0.5为阈值)

2.梯度上升和梯度下降(求解最优化问题)

3.随机梯度上升(解决效率问题)
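下面用几行示意性的Python代码(数据为假设值,非原文代码)演示第1点:sigmoid把线性得分映射为(0,1)之间的概率,并以0.5为阈值区分好坏客户。

import numpy as np

def sigmoid(z):
    # 将任意实数映射到 (0, 1) 区间,作为"坏客户"的概率
    return 1.0 / (1.0 + np.exp(-z))

scores = np.array([-2.0, -0.3, 0.0, 0.8, 3.0])   # 假设的线性得分 w·x + b
probs = sigmoid(scores)
labels = (probs >= 0.5).astype(int)              # 概率 >= 0.5 判为坏客户(1)
print(probs)    # 约 [0.119 0.426 0.5 0.69 0.953]
print(labels)   # [0 0 1 1 1]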


CSV保存

结构:第一列数值

第二列布尔值


# -*- coding: utf-8 -*-
"""
Created on Fri Jul 21 09:28:25 2017
@author: toby

CSV数据结构:第一列为数值(温度),第二列为二分类型(是否发生故障)
"""
import csv
import numpy as np
import pandas as pd
from statsmodels.formula.api import glm
from statsmodels.genmod.families import Binomial
import matplotlib.pyplot as plt
import seaborn as sns

# sns.set_context 的另外两个取值 "notebook" 和 "paper" 可能无法正常显示中文
sns.set_context('poster')

fileName = "challenger_data.csv"
reader = csv.reader(open(fileName))   # 获取数据,类型:阵列

def getData():
    '''读取数据'''
    inFile = 'challenger_data.csv'
    data = np.genfromtxt(inFile, skip_header=1, usecols=[1, 2],
                         missing_values='NA', delimiter=',')
    # 消除NaN数据
    data = data[~np.isnan(data[:, 1])]
    return data

def prepareForFit(inData):
    '''把温度值去重,统计每个温度下的故障数和正常数,返回DataFrame'''
    df = pd.DataFrame()
    # np.unique返回去重后的值
    df['temp'] = np.unique(inData[:, 0])
    df['failed'] = 0
    df['ok'] = 0
    df['total'] = 0
    df.index = df.temp.values
    # 统计发射次数和故障次数;inData.shape[0]表示样本数
    for ii in range(inData.shape[0]):
        # 当前样本的温度
        curTemp = inData[ii, 0]
        # 当前样本是否发生故障
        curVal = inData[ii, 1]
        df.loc[curTemp, 'total'] += 1
        if curVal == 1:
            df.loc[curTemp, 'failed'] += 1
        else:
            df.loc[curTemp, 'ok'] += 1
    return df

# 逻辑回归公式
def logistic(x, beta, alpha=0):
    '''Logistic函数'''
    # 点积,比如np.dot([1,2,3],[4,5,6]) = 1*4 + 2*5 + 3*6 = 32
    # 注意符号:下面的模型拟合的是P(ok),这里取1/(1+exp(beta*x+alpha)),得到的是故障概率
    return 1.0 / (1.0 + np.exp(np.dot(beta, x) + alpha))

def setFonts(*options):
    return

# 绘图
def Plot(data, alpha, beta, picName):
    # 阵列,数值
    array_values = data[:, 0]
    # 阵列,二分类型
    array_type = data[:, 1]

    plt.figure()
    setFonts()
    # 改变指定主题的风格参数
    sns.set_style('darkgrid')
    # numpy输出精度局部控制
    np.set_printoptions(precision=3, suppress=True)
    plt.scatter(array_values, array_type, s=200, color="k", alpha=0.5)
    # 获取温度值列表(跳过表头)
    list_values = [row[1] for row in reader][1:]
    list_values = [int(i) for i in list_values]
    # 获取列表最大值和最小值
    max_value = max(list_values)
    min_value = min(list_values)
    x = np.arange(min_value, max_value)
    y = logistic(x, beta, alpha)
    plt.plot(x, y, 'r')
    # 设置y轴坐标刻度
    plt.yticks([0, 1])
    plt.xlim([min_value, max_value])
    plt.ylabel("probability")
    plt.xlabel("values")
    plt.title("logistic regression")
    # 图像外部边缘的调整
    plt.tight_layout()
    plt.savefig(picName)
    plt.show()

# 用于预测逻辑回归概率
def Prediction(x):
    y = logistic(x, beta, alpha)
    print("probability prediction:", y)

# 获取数据
inData = getData()
# 得到频率计算后的数据
dfFit = prepareForFit(inData)
# Generalized Linear Model,建立二项式模型
model = glm('ok + failed ~ temp', data=dfFit, family=Binomial()).fit()
print(model.summary())

alpha = model.params[0]
beta = model.params[1]
Plot(inData, alpha, beta, "logistic_regression.png")

逻辑回归调参文档

默认参数

class sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)

乳腺癌逻辑回归调参测试脚本

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 14:11:27 2018
@author: zhi.li04

官网调参文档
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

重要参数
penalty
在调参时如果我们主要的目的只是为了解决过拟合,一般penalty选择L2正则化就够了。但是如果选择L2正则化发现还是过拟合,即预测效果差的时候,就可以考虑L1正则化。另外,如果模型的特征非常多,我们希望一些不重要的特征系数归零,从而让模型系数稀疏化的话,也可以使用L1正则化。

tol : Tolerance for stopping criteria

n_jobs : int, default: 1
Number of CPU cores used when parallelizing over classes if multi_class='ovr'. This parameter is ignored when the ``solver`` is set to 'liblinear' regardless of whether 'multi_class' is specified or not. If given a value of -1, all cores are used.
"""
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# 默认参数
logist = LogisticRegression()
logist.fit(x_train, y_train)

print("logistic regression with default parameters:")
print("accuracy on the training subset:{:.3f}".format(logist.score(x_train, y_train)))
print("accuracy on the test subset:{:.3f}".format(logist.score(x_test, y_test)))
print(logist.sparsify())
print("coefficience:")
print(logist.coef_)
print("intercept:")
print(logist.intercept_)
print()
'''
默认是l2惩罚
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
'''

# 正则化l1(L1正则化需要liblinear求解器)
logist2 = LogisticRegression(C=1, penalty='l1', tol=0.01, solver='liblinear')
logist2.fit(x_train, y_train)

print("logistic regression with penalty l1:")
print("accuracy on the training subset:{:.3f}".format(logist2.score(x_train, y_train)))
print("accuracy on the test subset:{:.3f}".format(logist2.score(x_test, y_test)))
print("coefficience:")
print(logist2.coef_)
print("intercept:")
print(logist2.intercept_)
print()
'''
logistic regression:
accuracy on the training subset:0.955
accuracy on the test subset:0.958

logist.sparsify()
Out[6]:
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.01,
          verbose=0, warm_start=False)

len(logist2.coef_[0])
Out[20]: 30
'''

# 正则化l2
logist3 = LogisticRegression(C=1, penalty='l2', tol=0.01)
logist3.fit(x_train, y_train)

print("logistic regression with penalty l2:")
print("accuracy on the training subset:{:.3f}".format(logist3.score(x_train, y_train)))
print("accuracy on the test subset:{:.3f}".format(logist3.score(x_test, y_test)))
print("coefficience:")
print(logist3.coef_)
print("intercept:")
print(logist3.intercept_)
print()

  


  • 正则化选择参数(惩罚项的种类)

penalty : str, 'l1' or 'l2', default: 'l2'

Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties.

  • LogisticRegression默认带了正则化项。penalty参数可选择的值为”l1″和”l2″.分别对应L1的正则化和L2的正则化,默认是L2的正则化。
  • 在调参时如果我们主要的目的只是为了解决过拟合,一般penalty选择L2正则化就够了。但是如果选择L2正则化发现还是过拟合,即预测效果差的时候,就可以考虑L1正则化。另外,如果模型的特征非常多,我们希望一些不重要的特征系数归零,从而让模型系数稀疏化的话,也可以使用L1正则化。
  • penalty参数的选择会影响我们损失函数优化算法的选择,即参数solver的选择。如果是L2正则化,那么4种可选的算法{'newton-cg', 'lbfgs', 'liblinear', 'sag'}都可以选择。但是如果penalty是L1正则化的话,就只能选择'liblinear'了。这是因为L1正则化的损失函数不是连续可导的,而{'newton-cg', 'lbfgs', 'sag'}这三种优化算法都需要损失函数的一阶或者二阶连续导数,而'liblinear'并没有这个依赖。
  • dual : bool, default: False

Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.

  • 对偶或者原始方法。对偶形式只在惩罚项为L2且solver为liblinear时实现,通常样本数大于特征数时选原始形式,默认为False。
  • C : float, default: 1.0

Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

  • C为正则化强度λ的倒数,必须为正的浮点数,默认为1.0;值越小,正则化越强。
  • fit_intercept : bool, default: True

Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.

  • 是否存在截距,默认存在
  • intercept_scaling : float, default 1.

Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight.

Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.

仅在solver为'liblinear'且fit_intercept设置为True时有用。

  • 优化算法选择参数

solver

{'newton-cg', 'lbfgs', 'liblinear', 'sag'}, default: 'liblinear'

Algorithm to use in the optimization problem.

For small datasets, 'liblinear' is a good choice, whereas 'sag' is faster for large ones.

For multiclass problems, only 'newton-cg', 'sag' and 'lbfgs' handle multinomial loss; 'liblinear' is limited to one-versus-rest schemes.

'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty.

Note that 'sag' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing.

New in version 0.17: Stochastic Average Gradient descent solver.

  • solver参数决定了我们对逻辑回归损失函数的优化方法,有四种算法可以选择,分别是:

a) liblinear:使用了开源的liblinear库实现,内部使用了坐标轴下降法来迭代优化损失函数。

b) lbfgs:拟牛顿法的一种,利用损失函数二阶导数矩阵即海森矩阵来迭代优化损失函数。

c) newton-cg:也是牛顿法家族的一种,利用损失函数二阶导数矩阵即海森矩阵来迭代优化损失函数。

d) sag:即随机平均梯度下降,是梯度下降法的变种,和普通梯度下降法的区别是每次迭代仅仅用一部分的样本来计算梯度,适合于样本数据多的时候。

  • 从上面的描述可以看出,newton-cg、lbfgs和sag这三种优化算法都需要损失函数的一阶或者二阶连续导数,因此不能用于没有连续导数的L1正则化,只能用于L2正则化。而liblinear通吃L1正则化和L2正则化。
  • 同时,sag每次仅仅使用了部分样本进行梯度迭代,所以当样本量少的时候不要选择它,而如果样本量非常大,比如大于10万,sag是第一选择。但是sag不能用于L1正则化,所以当你有大量的样本,又需要L1正则化的话就要自己做取舍了。要么通过对样本采样来降低样本量,要么回到L2正则化。
  • 从上面的描述,大家可能觉得,既然newton-cg, lbfgs和sag这么多限制,如果不是大样本,我们选择liblinear不就行了嘛!错,因为liblinear也有自己的弱点!我们知道,逻辑回归有二元逻辑回归和多元逻辑回归。对于多元逻辑回归常见的有one-vs-rest(OvR)和many-vs-many(MvM)两种。而MvM一般比OvR分类相对准确一些。郁闷的是liblinear只支持OvR,不支持MvM,这样如果我们需要相对精确的多元逻辑回归时,就不能选择liblinear了。也意味着如果我们需要相对精确的多元逻辑回归不能使用L1正则化了。
  • 总结几种优化算法适用情况:

L1 + liblinear:适用于小数据集;如果选择L2正则化发现还是过拟合,即预测效果差的时候,就可以考虑L1正则化;如果模型的特征非常多,希望一些不重要的特征系数归零,从而让模型系数稀疏化的话,也可以使用L1正则化。

L2 + liblinear:liblinear只支持多元逻辑回归的OvR,不支持MvM,但MvM相对精确。

L2 + lbfgs/newton-cg/sag:适合较大数据集,支持one-vs-rest(OvR)和many-vs-many(MvM)两种多元逻辑回归。

L2 + sag:如果样本量非常大,比如大于10万,sag是第一选择;但不能用于L1正则化。

    具体OvR和MvM有什么不同下一节讲。
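下面给出一个简化示例(沿用上文的乳腺癌数据;参数组合仅作演示,并非唯一写法),展示penalty与solver的几种搭配,以及L1正则化带来的系数稀疏化:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# L1正则化需要搭配liblinear;L2正则化则liblinear、lbfgs等都可以
for penalty, solver in [('l1', 'liblinear'), ('l2', 'liblinear'), ('l2', 'lbfgs')]:
    clf = LogisticRegression(penalty=penalty, solver=solver, C=1.0, max_iter=1000)
    clf.fit(x_train, y_train)
    n_zero = (clf.coef_ == 0).sum()   # L1会把一部分系数压成0,得到稀疏模型
    print(penalty, solver,
          "test accuracy: %.3f" % clf.score(x_test, y_test),
          "zero coefficients:", n_zero)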

  • 分类方式选择参数:

multi_class : str, {'ovr', 'multinomial'}, default: 'ovr'

Multiclass option can be either 'ovr' or 'multinomial'. If the option chosen is 'ovr', then a binary problem is fit for each label. Else the loss minimised is the multinomial loss fit across the entire probability distribution. Works only for the 'newton-cg', 'sag' and 'lbfgs' solver.

New in version 0.18: Stochastic Average Gradient descent solver for 'multinomial' case.

  • ovr即前面提到的one-vs-rest(OvR),而multinomial即前面提到的many-vs-many(MvM)。如果是二元逻辑回归,ovr和multinomial并没有任何区别,区别主要在多元逻辑回归上。
  • OvR和MvM有什么不同?

OvR的思想很简单,无论你是多少元逻辑回归,我们都可以看做二元逻辑回归。具体做法是,对于第K类的分类决策,我们把所有第K类的样本作为正例,除了第K类样本以外的所有样本都作为负例,然后在上面做二元逻辑回归,得到第K类的分类模型。其他类的分类模型获得以此类推。

而MvM则相对复杂,这里举MvM的特例one-vs-one(OvO)作讲解。如果模型有T类,我们每次在所有的T类样本里面选择两类样本出来,不妨记为T1类和T2类,把所有的输出为T1和T2的样本放在一起,把T1作为正例,T2作为负例,进行二元逻辑回归,得到模型参数。我们一共需要T(T-1)/2次分类。

可以看出OvR相对简单,但分类效果相对略差(这里指大多数样本分布情况,某些样本分布下OvR可能更好)。而MvM分类相对精确,但是分类速度没有OvR快。如果选择了ovr,则4种损失函数的优化方法liblinear,newton-cg,lbfgs和sag都可以选择。但是如果选择了multinomial,则只能选择newton-cg, lbfgs和sag了。
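下面是一个示意性的小例子(改用iris三分类数据,仅为演示;较新版本的sklearn中multi_class参数已逐步弃用,可能给出警告),对比ovr与multinomial两种设置:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# ovr:为每个类别单独拟合一个"该类 vs 其余"的二元逻辑回归
ovr = LogisticRegression(multi_class='ovr', solver='liblinear')
# multinomial:直接最小化多项逻辑损失,只能用newton-cg/lbfgs/sag等solver
mnl = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

for name, clf in [('ovr', ovr), ('multinomial', mnl)]:
    clf.fit(X_train, y_train)
    print(name, "test accuracy: %.3f" % clf.score(X_test, y_test))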

  • 类型权重参数:(考虑误分类代价敏感、分类类型不平衡的问题)

class_weight : dict or 'balanced', default: None

Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).

Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.

New in version 0.17: class_weight='balanced' instead of deprecated class_weight='auto'.

  • class_weight参数用于标示分类模型中各种类型的权重,可以不输入,即不考虑权重,或者说所有类型的权重一样。如果选择输入的话,可以选择balanced让类库自己计算类型权重,或者我们自己输入各个类型的权重,比如对于0,1的二元模型,我们可以定义class_weight={0:0.9, 1:0.1},这样类型0的权重为90%,而类型1的权重为10%。
  • 如果class_weight选择balanced,那么类库会根据训练样本量来计算权重。某种类型样本量越多,则权重越低,样本量越少,则权重越高。当class_weight为balanced时,类权重计算方法如下:n_samples / (n_classes * np.bincount(y))

n_samples为样本数,n_classes为类别数量,np.bincount(y)会输出每个类的样本数,例如y=[1,0,0,1,1],则np.bincount(y)=[2,3]
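按上面的公式,可以用几行代码手工核对balanced权重的计算(示意):

import numpy as np

y = np.array([1, 0, 0, 1, 1])
n_samples = len(y)                 # 样本数 5
n_classes = len(np.unique(y))      # 类别数 2
# np.bincount(y) = [2, 3]:类别0有2个样本,类别1有3个样本
weights = n_samples / (n_classes * np.bincount(y))
print(weights)   # [1.25 0.8333...]:样本越少的类别权重越高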

  • 那么class_weight有什么作用呢?

在分类模型中,我们经常会遇到两类问题:

第一种是误分类的代价很高。比如对合法用户和非法用户进行分类,将非法用户分类为合法用户的代价很高,我们宁愿将合法用户分类为非法用户,这时可以人工再甄别,但是却不愿将非法用户分类为合法用户。这时,我们可以适当提高非法用户的权重。

第二种是样本是高度失衡的,比如我们有合法用户和非法用户的二元样本数据10000条,里面合法用户有9995条,非法用户只有5条,如果我们不考虑权重,则我们可以将所有的测试集都预测为合法用户,这样预测准确率理论上有99.95%,但是却没有任何意义。这时,我们可以选择balanced,让类库自动提高非法用户样本的权重。

提高了某种分类的权重,相比不考虑权重,会有更多的样本分类划分到高权重的类别,从而可以解决上面两类问题。

当然,对于第二种样本失衡的情况,我们还可以考虑用下一节讲到的样本权重参数: sample_weight,而不使用class_weight。sample_weight在下一节讲。

  • 样本权重参数:

sample_weight(fit函数参数)

  • 当样本是高度失衡的,导致样本不是总体样本的无偏估计,从而可能导致我们的模型预测能力下降。遇到这种情况,我们可以通过调节样本权重来尝试解决这个问题。调节样本权重的方法有两种,第一种是在class_weight使用balanced。第二种是在调用fit函数时,通过sample_weight来自己调节每个样本权重。在scikit-learn做逻辑回归时,如果上面两种方法都用到了,那么样本的真正权重是class_weight*sample_weight.
  • max_iter : int, default: 100

Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.

  • 仅在正则化优化算法为newton-cg, sag and lbfgs 才有用,算法收敛的最大迭代次数。
  • random_state : int seed, RandomState instance, default: None

The seed of the pseudo random number generator to use when shuffling the data. Used only in solvers 'sag' and 'liblinear'.

  • 随机数种子,默认为无,仅在正则化优化算法为sag,liblinear时有用。
  • tol : float, default: 1e-4

Tolerance for stopping criteria.迭代终止判据的误差范围。

  • verbose : int, default: 0

For the liblinear and lbfgs solvers set verbose to any positive number for verbosity.

  • 日志冗长度int:冗长度;0:不输出训练过程;1:偶尔输出; >1:对每个子模型都输出
  • warm_start : bool, default: False

When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver.

New in version 0.17: warm_start to support lbfgs, newton-cg, sag solvers.

  • 是否热启动:如果为True,则下一次训练会复用上一次调用fit得到的解作为初始化(而不是从头开始),对liblinear求解器无效。默认值为False。
  • n_jobs : int, default: 1

Number of CPU cores used during the cross-validation loop. If given a value of -1, all cores are used.

  • 并行数,int:个数;-1:跟CPU核数一致;1:默认值
  1. LogisticRegression类中的方法

LogisticRegression类中的方法有如下几种,常用的是fit和predict


1 Logistic回归

假设现在有一些数据点,我们利用一条直线对这些点进行拟合(该线称为最佳拟合直线),这个拟合过程就称作为回归,如下图所示:

(图略:数据点与最佳拟合直线示意图)

Logistic回归是回归的一种方法,它利用的是Sigmoid函数阈值在[0,1]这个特性。Logistic回归进行分类的主要思想是:根据现有数据对分类边界线建立回归公式,以此进行分类。其实,Logistic本质上是一个基于条件概率的判别模型(Discriminative Model)。

乳腺癌数据-逻辑回归源代码

算法思路:

sigmoid函数计算二分类概率

梯度上升法获取最优系数

随机梯度上升法获取最优系数,同时节约时间


  

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 17:25:33 2018
@author: zhi.li04
"""
from numpy import *
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

df = pd.read_excel("test.xlsx")
labelMat = df["label"]
dataMat = df[["x1", "x2"]]

def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))

# 梯度上升函数,返回最优系数
def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)                    # 转换为NumPy矩阵
    labelMat = mat(classLabels).transpose()        # 转换为NumPy矩阵并转置成列向量
    m, n = shape(dataMatrix)                       # 查看矩阵或者数组的维数
    alpha = 0.001                                  # 步长(学习率)
    maxCycles = 500                                # 最大迭代次数
    weights = ones((n, 1))
    for k in range(maxCycles):                     # 以矩阵运算为主
        h = sigmoid(dataMatrix * weights)          # 矩阵乘法
        error = (labelMat - h)                     # 向量减法
        weights = weights + alpha * dataMatrix.transpose() * error  # 矩阵乘法
    return weights

# 改进的随机梯度上升函数
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)                              # 初始化为全1
    for j in range(numIter):
        dataIndex = list(range(m))                 # python3中range不能直接del,需先转成list
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.0001     # alpha随迭代递减,但因常数项不会降到0
            randIndex = int(random.uniform(0, len(dataIndex)))
            h = sigmoid(sum(dataMatrix[randIndex] * weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(dataIndex[randIndex])
    return weights

def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0

coef_matrix = gradAscent(dataMat, labelMat)

cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
weights = gradAscent(x_train, y_train)
predict = classifyVector(x_train[0], weights)
'''
exp(0)
Out[4]: 1.0

exp(1)
Out[5]: 2.7182818284590451

x = np.arange(4).reshape((2,2))
x
Out[24]:
array([[0, 1],
       [2, 3]])

np.transpose(x)
Out[25]:
array([[0, 2],
       [1, 3]])

classifyVector(x_train[0], weights)
Out[50]: 1.0

classifyVector(x_train[1], weights)
Out[51]: 1.0

classifyVector(x_train[2], weights)
__main__:24: RuntimeWarning: overflow encountered in exp
Out[52]: 0.0

classifyVector(x_train[3], weights)
Out[53]: 1.0
'''

  

 sigmoid函数

最早Logistic函数是皮埃尔·弗朗索瓦·韦吕勒在1844或1845年在研究它与人口增长的关系时命名的。广义Logistic曲线可以模仿一些情况人口增长(P)的 S 形曲线。起初阶段大致是指数增长;然后随着开始变得饱和,增加变慢;最后,达到成熟时增加停止。

Sigmoid函数是一个在生物学中常见的S型函数,也称为S型生长曲线。在信息科学中,由于其单调递增且反函数也单调递增等性质,Sigmoid函数常被用作神经网络的阈值函数,把变量映射到(0,1)之间。

sigmoid公式:

S(x) = 1 / (1 + e^(−x))

(函数曲线为S形,值域为(0,1),在x=0处取值0.5)

import sys
from pylab import *
import matplotlib.pyplot as plt

t = arange(-10, 10, 0.1)
s = 1 / (1 + exp(-t))
plot(t, s)
plt.xlabel("x")
plt.ylabel("sigmoid(x)")

  


虽然sigmoid函数拥有良好的性质,可以用在分类问题上,如作为逻辑回归模型的分类器。但为什么偏偏选用这个函数呢?除了上述的数学上更易处理外,还有其本身的推导特性。 
对于分类问题,尤其是二分类问题,都假定是服从伯努利分布。伯努利分布的概率质量函数(PMF)为:P(X=x) = p^x · (1−p)^(1−x),x ∈ {0,1}。

机器学习中一个重要的预测模型逻辑回归(LR)就是基于Sigmoid函数实现的。LR模型的主要任务是给定一些历史的{X,Y},其中X是样本n个特征值,Y的取值是{0,1}代表正例与负例,通过对这些历史样本的学习,从而得到一个数学模型,给定一个新的X,能够预测出Y。LR模型是一个二分类模型,即对于一个X,预测其发生或不发生。但事实上,对于一个事件发生的情况,往往不能得到100%的预测,因此LR可以得到一个事件发生的可能性,超过50%则认为事件发生,低于50%则认为事件不发生

从LR的目的上来看,在选择函数时,有两个条件是必须要满足的: 
1. 取值范围在0~1之间。 
2. 对于一个事件发生情况,50%是其结果的分水岭,所选函数应该关于点(0, 0.5)中心对称。

从这两个条件来看,Sigmoid很好的符合了LR的需求。关于逻辑回归的具体实现与相关问题,可看这篇文章Logistic函数(sigmoid函数) – wenjun’s blog,在此不再赘述。

用最大似然估计求逻辑回归参数

https://blog.csdn.net/star_liux/article/details/39666737

一.最大似然估计

选择一个(一组)参数使得实验结果具有最大概率。

A. 如果分布是离散型的,其分布律为 p(x;θ),θ 是待估计的参数,这里我们假设分布形式 p(x;θ) 为已知量。设 X1, X2, …, Xn 是来自于X的样本,则 X1, X2, …, Xn 的联合分布律为:

∏_{i=1}^{n} p(xᵢ; θ)   (1)

设 x1, x2, …, xn 是 X1, X2, …, Xn 的一个样本值,则可知 X1, …, Xn 取 x1, …, xn 的概率,即事件 {X1=x1, …, Xn=xn} 发生的概率为:

L(θ) = L(x1, …, xn; θ) = ∏_{i=1}^{n} p(xᵢ; θ)   (2)

这里,因为样本值是已知的,所以(2)是 θ 的函数,L(θ) 称为样本的似然函数。

最大似然估计:已知样本值 x1, …, xn,选取一组参数 θ̂,使概率 L(x1, …, xn; θ) 达到最大值,此时的 θ̂ 为最大似然估计值。即取 θ̂ 使得:

L(x1, …, xn; θ̂) = max_θ L(x1, …, xn; θ)

θ̂ 与 x1, …, xn 有关,记为 θ̂(x1, …, xn),并称其为参数 θ 的极大似然估计值。

B. 如果分布X是连续型,其概率密度 f(x;θ) 的形式已知,θ 为待估计参数,则 X1, …, Xn 的联合密度为:

∏_{i=1}^{n} f(xᵢ; θ)   (3)

设 x1, …, xn 为相应 X1, …, Xn 的一个样本值,则随机点 (X1, …, Xn) 落在 (x1, …, xn) 的邻域内的概率近似为:

∏_{i=1}^{n} f(xᵢ; θ) dxᵢ   (4)

最大似然估计即为求 θ̂ 值,使得(4)的概率最大。由于 ∏ dxᵢ 不随 θ 而变,故似然函数为:

L(θ) = L(x1, …, xn; θ) = ∏_{i=1}^{n} f(xᵢ; θ)   (5)

C. 求最大似然估计参数的步骤:

(1) 写出似然函数:

L(θ) = ∏_{i=1}^{n} p(xᵢ; θ)(离散型)或 L(θ) = ∏_{i=1}^{n} f(xᵢ; θ)(连续型)   (6)

这里,n为样本数量,似然函数表示n个样本(事件)同时发生的概率。

(2) 对似然函数取对数:

ln L(θ) = ∑_{i=1}^{n} ln p(xᵢ; θ)(或 ∑_{i=1}^{n} ln f(xᵢ; θ))

(3) 将对数似然函数对各参数求偏导数并令其为0,得到对数似然方程组。

(4) 从方程组中解出各个参数。

D. 举例:

设 X ~ N(μ, σ²);μ、σ² 为未知参数,x1, …, xn 为来自X的一个样本值。求 μ、σ² 的极大似然估计值。

解:X的概率密度为:

f(x; μ, σ²) = 1/(√(2π)·σ) · exp( −(x−μ)² / (2σ²) )

似然函数为:

L(μ, σ²) = ∏_{i=1}^{n} 1/(√(2π)·σ) · exp( −(xᵢ−μ)² / (2σ²) )

ln L = −(n/2)·ln(2π) − (n/2)·ln(σ²) − 1/(2σ²)·∑_{i=1}^{n}(xᵢ−μ)²

令 ∂lnL/∂μ = 0、∂lnL/∂(σ²) = 0,即:(1/σ²)·∑(xᵢ−μ) = 0,−n/(2σ²) + 1/(2σ⁴)·∑(xᵢ−μ)² = 0

解得:μ̂ = (1/n)·∑xᵢ = x̄,带入解得 σ̂² = (1/n)·∑(xᵢ−x̄)²
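可以用几行Python做个数值验证(数据为随机生成的示意):样本均值和除以n的样本方差确实使正态对数似然达到最大。

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(loc=2.0, scale=3.0, size=1000)

def loglik(mu, sigma2):
    # 正态分布的对数似然
    return -0.5 * len(x) * np.log(2 * np.pi * sigma2) - np.sum((x - mu) ** 2) / (2 * sigma2)

mu_hat = x.mean()
sigma2_hat = x.var()          # 注意:np.var默认除以n,正是极大似然估计
print(loglik(mu_hat, sigma2_hat))            # 最大
print(loglik(mu_hat + 0.5, sigma2_hat))      # 更小
print(loglik(mu_hat, sigma2_hat * 1.2))      # 更小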

二.逻辑回归

逻辑回归不是回归,而是分类,是从线性回归中衍生出来的分类策略。当y只有两个取值(比如0、1)、线性回归不能很好地拟合时,用逻辑回归来对其进行二值分类。

这里逻辑函数(S型函数)为:

g(z) = 1 / (1 + e^(−z))   (7)

于是,可得估计函数:

h_θ(x) = g(θᵀx) = 1 / (1 + e^(−θᵀx))   (8)

这里,我们的目的是求出一组 θ 值,使得这组 θ 可以很好地模拟出训练样本的类值。

由于二值分类很像二项分布,我们把单一样本的类值假设为发生概率,则:

P(y=1|x; θ) = h_θ(x),P(y=0|x; θ) = 1 − h_θ(x)   (9)

可以写成概率一般式:

P(y|x; θ) = (h_θ(x))^y · (1 − h_θ(x))^(1−y)   (10)

由最大似然估计原理,我们可以通过m个训练样本值,来估计出 θ 值,使得似然函数值最大:

L(θ) = ∏_{i=1}^{m} (h_θ(x⁽ⁱ⁾))^(y⁽ⁱ⁾) · (1 − h_θ(x⁽ⁱ⁾))^(1−y⁽ⁱ⁾)   (11)

这里,L(θ) 为m个训练样本同时发生的概率。对 L(θ) 求log,得:

ℓ(θ) = log L(θ) = ∑_{i=1}^{m} [ y⁽ⁱ⁾·log h_θ(x⁽ⁱ⁾) + (1 − y⁽ⁱ⁾)·log(1 − h_θ(x⁽ⁱ⁾)) ]   (12)

我们用随机梯度上升法,求使 ℓ(θ) 最大化时的 θ 值,迭代函数为:

θⱼ := θⱼ + α·∂ℓ(θ)/∂θⱼ   (13)

这里对每个 θⱼ 分量进行求导,得:

∂ℓ(θ)/∂θⱼ = ∑_{i=1}^{m} ( y⁽ⁱ⁾ − h_θ(x⁽ⁱ⁾) )·xⱼ⁽ⁱ⁾   (14)

于是,随机梯度上升法迭代算法为:

repeat until convergence{

for i = 1 to m{

θⱼ := θⱼ + α·( y⁽ⁱ⁾ − h_θ(x⁽ⁱ⁾) )·xⱼ⁽ⁱ⁾   (15)

}

}

思考:

我们求最大似然函数参数的立足点是步骤C,即求出每个参数方向上的偏导数,并让偏导数为0,最后求解此方程组。由于 ℓ(θ) 中参数数量的不确定,考虑到可能参数数量很大,此时直接求解方程组的解变得很困难。于是,我们用随机梯度上升法,求解方程组的值。
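下面是公式(15)的一个极简NumPy实现示意(数据随机生成,学习率与迭代轮数为假设值,仅用于说明迭代过程):

import numpy as np

rng = np.random.default_rng(1)
m = 200
X = np.column_stack([np.ones(m), rng.normal(size=(m, 2))])   # 第一列为常数项
true_theta = np.array([0.5, 2.0, -1.0])
y = (1 / (1 + np.exp(-X @ true_theta)) > rng.random(m)).astype(float)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

theta = np.zeros(3)
alpha = 0.1
for epoch in range(50):                      # repeat until convergence(这里简单固定轮数)
    for i in rng.permutation(m):             # for i = 1 to m(随机顺序)
        h = sigmoid(X[i] @ theta)
        theta = theta + alpha * (y[i] - h) * X[i]   # 公式(15)的单样本更新
print(theta)   # 应大致接近 true_theta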

备注:

(a) 公式(14)的化简基于g(z)导函数,如下:

g′(z) = g(z)·(1 − g(z))   (16)

(b) 下图为逻辑函数g(z)的分布图:

(图略:g(z)的S形曲线)

https://blog.csdn.net/zjuPeco/article/details/77165974

前言

逻辑回归是分类当中极为常用的手段,因此,掌握其内在原理是非常必要的。我会争取在本文中尽可能简明地展现逻辑回归(logistic regression)的整个推导过程。

什么是逻辑回归

逻辑回归在某些书中也被称为对数几率回归,明明被叫做回归,却用在了分类问题上,我个人认为这是因为逻辑回归用了和回归类似的方法来解决了分类问题。 
假设有一个二分类问题,输出为 y∈{0,1},而线性回归模型产生的预测值 z = wᵀx + b 是实数值,我们希望有一个理想的阶跃函数来帮我们实现 z 值到 0/1 值的转化。

ϕ(z) = 0(若 z < 0);0.5(若 z = 0);1(若 z > 0)

然而该函数不连续,我们希望有一个单调可微的函数来供我们使用,于是便找到了Sigmoid function来替代。

ϕ(z) = 1 / (1 + e^(−z))

两者的图像如下图所示(图片出自文献2)


图1:sigmoid & step function

有了Sigmoid function之后,由于其取值在[0,1],我们就可以将其视为类1的后验概率估计 p(y=1|x)。说白了,就是如果有了一个测试点x,那么就可以用Sigmoid function算出来的结果来当做该点x属于类别1的概率大小。
于是,非常自然地,我们把Sigmoid function计算得到的值大于等于0.5的归为类别1,小于0.5的归为类别0。

ŷ = 1(若 ϕ(z) ≥ 0.5);0(否则)

同时逻辑回归与自适应线性网络非常相似,两者的区别在于逻辑回归的激活函数是Sigmoid function,而自适应线性网络的激活函数是y=x,两者的网络结构如下图所示(图片出自文献1)。


图2:自适应线性网络


图3:逻辑回归网络

逻辑回归的代价函数

好了,所要用的几个函数我们都有了,接下来要做的就是根据给定的训练集,把参数 w 给求出来。要找参数 w,首先就是得把代价函数(cost function)给定义出来,也就是目标函数。
我们第一个想到的自然是模仿线性回归的做法,利用误差平方和来当代价函数。

J(w) = ∑ᵢ ½·( ϕ(z⁽ⁱ⁾) − y⁽ⁱ⁾ )²

其中,z⁽ⁱ⁾ = wᵀx⁽ⁱ⁾ + b,i 表示第 i 个样本点,y⁽ⁱ⁾ 表示第 i 个样本的真实值,ϕ(z⁽ⁱ⁾) 表示第 i 个样本的预测值。
这时,如果我们将 ϕ(z⁽ⁱ⁾) = 1/(1+e^(−z⁽ⁱ⁾)) 代入的话,会发现这是一个非凸函数,这就意味着代价函数有着许多的局部最小值,这不利于我们的求解。


图4:凸函数和非凸函数

那么我们不妨来换一个思路解决这个问题。前面,我们提到了 ϕ(z) 可以视为类1的后验估计,所以我们有

p(y=1|x; w) = ϕ(wᵀx + b) = ϕ(z)

p(y=0|x; w) = 1 − ϕ(z)

其中,p(y=1|x;w) 表示给定 w 时,x 点 y=1 的概率大小。
上面两式可以写成一般形式

p(y|x; w) = ϕ(z)^y · (1 − ϕ(z))^(1−y)

接下来我们就要用极大似然估计来根据给定的训练集估计出参数 w。

L(w) = ∏_{i=1}^{n} p(y⁽ⁱ⁾|x⁽ⁱ⁾; w) = ∏_{i=1}^{n} (ϕ(z⁽ⁱ⁾))^(y⁽ⁱ⁾) · (1 − ϕ(z⁽ⁱ⁾))^(1−y⁽ⁱ⁾)

为了简化运算,我们对上面这个等式的两边都取一个对数

l(w) = ln L(w) = ∑_{i=1}^{n} [ y⁽ⁱ⁾·ln(ϕ(z⁽ⁱ⁾)) + (1 − y⁽ⁱ⁾)·ln(1 − ϕ(z⁽ⁱ⁾)) ]

我们现在要求的是使得 l(w) 最大的 w。没错,我们的代价函数出现了:在 l(w) 前面加个负号,求最大不就变成求最小了吗?这不就是我们的代价函数了吗?

J(w) = −l(w) = −∑_{i=1}^{n} [ y⁽ⁱ⁾·ln(ϕ(z⁽ⁱ⁾)) + (1 − y⁽ⁱ⁾)·ln(1 − ϕ(z⁽ⁱ⁾)) ]

为了更好地理解这个代价函数,我们不妨拿单个样本的情形来看看

J(ϕ(z), y; w) = −y·ln(ϕ(z)) − (1 − y)·ln(1 − ϕ(z))

也就是说

J(ϕ(z), y; w) = −ln(ϕ(z))(若 y=1);−ln(1 − ϕ(z))(若 y=0)

我们来看看这是一个怎么样的函数


图5:代价函数

从图中不难看出,如果样本的值是1的话,估计值 ϕ(z) 越接近1付出的代价就越小,反之越大;同理,如果样本的值是0的话,估计值 ϕ(z) 越接近0付出的代价就越小,反之越大。

利用梯度下降法求参数

在开始梯度下降之前,这里要插一句:sigmoid function 有一个很好的性质,就是

ϕ′(z) = ϕ(z)·(1 − ϕ(z))
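这一性质可以用数值导数简单验证一下(示意代码):

import numpy as np

def phi(z):
    return 1.0 / (1.0 + np.exp(-z))

z = 0.7
eps = 1e-6
numeric = (phi(z + eps) - phi(z - eps)) / (2 * eps)   # 数值导数
analytic = phi(z) * (1 - phi(z))                      # ϕ'(z) = ϕ(z)(1−ϕ(z))
print(numeric, analytic)   # 两者应几乎相等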

下面会用到这个性质。 
还有,我们要明确一点,梯度的负方向就是代价函数下降最快的方向。什么?为什么?好,我来说明一下。借助于泰勒展开,我们有

f(x + δ) − f(x) ≈ f′(x)·δ

其中,f′(x) 和 δ 为向量,那么这两者的内积就等于

f′(x)·δ = ||f′(x)||·||δ||·cosθ

当 θ=π 时,也就是 δ 在 f′(x) 的负方向上时,内积取得最小值,也就是下降得最快的方向了~
okay?好,坐稳了,我们要开始下降了。

w := w + Δw, Δw = −η·∇J(w)

没错,就是这么下降。没反应过来?那我再写详细一些

wⱼ := wⱼ + Δwⱼ, Δwⱼ = −η·∂J(w)/∂wⱼ

其中,wⱼ 表示第 j 个特征的权重;η 为学习率,用来控制步长。
重点来了。

∂J(w)/∂wⱼ = −∑_{i=1}^{n} ( y⁽ⁱ⁾·1/ϕ(z⁽ⁱ⁾) − (1−y⁽ⁱ⁾)·1/(1−ϕ(z⁽ⁱ⁾)) )·∂ϕ(z⁽ⁱ⁾)/∂wⱼ
          = −∑_{i=1}^{n} ( y⁽ⁱ⁾·1/ϕ(z⁽ⁱ⁾) − (1−y⁽ⁱ⁾)·1/(1−ϕ(z⁽ⁱ⁾)) )·ϕ(z⁽ⁱ⁾)·(1−ϕ(z⁽ⁱ⁾))·∂z⁽ⁱ⁾/∂wⱼ
          = −∑_{i=1}^{n} ( y⁽ⁱ⁾·(1−ϕ(z⁽ⁱ⁾)) − (1−y⁽ⁱ⁾)·ϕ(z⁽ⁱ⁾) )·xⱼ⁽ⁱ⁾
          = −∑_{i=1}^{n} ( y⁽ⁱ⁾ − ϕ(z⁽ⁱ⁾) )·xⱼ⁽ⁱ⁾

所以,在使用梯度下降法更新权重时,只要根据下式即可

wⱼ := wⱼ + η·∑_{i=1}^{n} ( y⁽ⁱ⁾ − ϕ(z⁽ⁱ⁾) )·xⱼ⁽ⁱ⁾

此式与线性回归时更新权重用的式子极为相似,也许这也是逻辑回归要在后面加上回归两个字的原因吧。 
当然,在样本量极大的时候,每次更新权重会非常耗费时间,这时可以采用随机梯度下降法,这时每次迭代时需要将样本重新打乱,然后用下式不断更新权重。

wⱼ := wⱼ + η·( y⁽ⁱ⁾ − ϕ(z⁽ⁱ⁾) )·xⱼ⁽ⁱ⁾, for i in range(n)

也就是去掉了求和,而是针对每个样本点都进行更新。

结束语

以上就是我参考了几本书中的说法之后,对逻辑回归整个推导过程的梳理,也不知道讲清楚没有。
如有不足,还请指正~

参考文献

[1] Raschka S. Python Machine Learning[M]. Packt Publishing, 2015. 
[2] 周志华. 机器学习 : = Machine learning[M]. 清华大学出版社, 2016.

 梯度上升法

https://blog.csdn.net/u013709270/article/details/78667531

梯度下降算法(Gradient Descent Optimization)是神经网络模型训练最常用的优化算法。对于深度学习模型,基本都是采用梯度下降算法来进行优化训练的。梯度下降算法背后的原理:目标函数 J(θ) 关于参数 θ 的梯度是目标函数上升最快的方向。对于最小化优化问题,只需要将参数沿着梯度相反的方向前进一个步长,就可以实现目标函数的下降。这个步长又称为学习速率 η。参数更新公式如下:

θ ← θ − η·∇θJ(θ)

其中 ∇θJ(θ) 是参数的梯度。根据计算目标函数 J(θ) 时采用数据量的不同,梯度下降算法又可以分为批量梯度下降算法(Batch Gradient Descent)、随机梯度下降算法(Stochastic Gradient Descent)和小批量梯度下降算法(Mini-batch Gradient Descent)。对于批量梯度下降算法,梯度是在整个训练集上计算的,如果数据集比较大,可能会面临内存不足问题,而且其收敛速度一般比较慢。随机梯度下降算法是另外一个极端,梯度是针对训练集中的一个训练样本计算的,又称为在线学习,即得到了一个样本,就可以执行一次参数更新,所以其收敛速度会快一些,但是有可能出现目标函数值震荡现象,因为高频率的参数更新导致了高方差。小批量梯度下降算法是折中方案,选取训练集中一个小批量样本计算梯度,这样可以保证训练过程更稳定,而且采用批量训练方法也可以利用矩阵计算的优势。这是目前最常用的梯度下降算法。

对于神经网络模型,借助于BP算法可以高效地计算梯度,从而实施梯度下降算法。但梯度下降算法一个老大难的问题是:不能保证全局收敛。如果这个问题解决了,深度学习的世界会和谐很多。梯度下降算法针对凸优化问题原则上是可以收敛到全局最优的,因为此时只有唯一的局部最优点。而实际上深度学习模型是一个复杂的非线性结构,一般属于非凸问题,这意味着存在很多局部最优点(鞍点),采用梯度下降算法可能会陷入局部最优,这应该是最头疼的问题。这点和进化算法如遗传算法很类似,都无法保证收敛到全局最优。因此,我们注定在这个问题上成为“高级调参师”。可以看到,梯度下降算法中一个重要的参数是学习速率,适当的学习速率很重要:学习速率过小时收敛速度慢,而过大时导致训练震荡,而且可能会发散。理想的梯度下降算法要满足两点:收敛速度要快;能全局收敛。为了这个理想,出现了很多经典梯度下降算法的变种,下面将分别介绍它们。

01

Momentum optimization

冲量梯度下降算法是Boris Polyak在1964年提出的,其基于这样一个物理事实:将一个小球从山顶滚下,其初始速率很慢,但在加速度作用下速率很快增加,并最终由于阻力的存在达到一个稳定速率。对于冲量梯度下降算法,其更新方程如下:

v ← γ·v + η·∇θJ(θ)
θ ← θ − v

可以看到,参数更新时不仅考虑当前梯度值,而且加上了一个积累项(冲量),但多了一个超参数 γ,一般取接近1的值如0.9。相比原始梯度下降算法,冲量梯度下降算法有助于加速收敛。当梯度与冲量方向一致时,冲量项会增加,而相反时,冲量项减少,因此冲量梯度下降算法可以减少训练的震荡过程。TensorFlow中提供了这一优化器:tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)。

02

NAG

NAG算法全称Nesterov Accelerated Gradient,是Yurii Nesterov在1983年提出的对冲量梯度下降算法的改进版本,其速度更快。其变化之处在于计算"超前梯度"来更新冲量项,具体公式如下:

v ← γ·v + η·∇θJ(θ − γ·v)
θ ← θ − v

既然参数要沿着 γ·v 更新,不妨计算未来位置 θ − γ·v 处的梯度,然后合并两项作为最终的更新项,其具体效果如图1所示,可以看到一定的加速效果。在TensorFlow中,NAG优化器为:tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)


图1 NAG效果图

03

AdaGrad

AdaGrad是Duchi在2011年提出的一种学习速率自适应的梯度下降算法。在训练迭代过程,其学习速率是逐渐衰减的,经常更新的参数其学习速率衰减更快,这是一种自适应算法。其更新过程如下:

s ← s + ∇θJ(θ) ⊙ ∇θJ(θ)
θ ← θ − η·∇θJ(θ) / √(s + ε)

其中 s 是梯度平方的积累量,在进行参数更新时,学习速率要除以这个积累量的平方根,加上一个很小值 ε 是为了防止除0的出现。由于 s 是逐渐增加的,那么学习速率是衰减的。考虑如图2所示的情况,目标函数在两个方向的坡度不一样,如果是原始的梯度下降算法,在接近坡底时收敛速度比较慢。而当采用AdaGrad,这种情况可以被改观。由于比较陡的方向梯度比较大,其学习速率将衰减得更快,这有利于参数沿着更接近坡底的方向移动,从而加速收敛。


图2 AdaGrad效果图

前面说到AdaGrad其学习速率实际上是不断衰减的,这会导致一个很大的问题,就是训练后期学习速率很小,导致训练过早停止,因此在实际中AdaGrad一般不会被采用,下面的算法将改进这一致命缺陷。不过TensorFlow也提供了这一优化器:tf.train.AdagradOptimizer。

04

RMSprop

RMSprop是Hinton在他的课程上讲到的,算是对Adagrad算法的改进,主要是解决学习速率过快衰减的问题。其思路很简单,类似Momentum思想,引入一个超参数 γ,在积累梯度平方项时进行衰减:

s ← γ·s + (1 − γ)·∇θJ(θ) ⊙ ∇θJ(θ)
θ ← θ − η·∇θJ(θ) / √(s + ε)

可以认为仅仅对距离当前时间较近的梯度进行积累,其中 γ 一般取值0.9,其实这样就是一个指数衰减的均值项,减少了积累量爆炸的情况,因此有助于避免学习速率很快下降的问题。同时Hinton也建议学习速率设置为0.001。RMSprop是一种比较好的优化算法,在TensorFlow中当然有其身影:tf.train.RMSPropOptimizer(learning_rate=learning_rate, momentum=0.9, decay=0.9, epsilon=1e-10)。

不得不说点题外话,同时期还有一个Adadelta算法,其也是Adagrad算法的改进,而且改进思路和RMSprop很像,但是其背后是基于一次梯度近似代替二次梯度的思想,感兴趣的可以看看相应的论文,这里不再赘述。

05

Adam

Adam全称Adaptive moment estimation,是Kingma等在2015年提出的一种新的优化算法,其结合了Momentum和RMSprop算法的思想。相比Momentum算法,其学习速率是自适应的;而相比RMSprop,其增加了冲量项。所以,Adam是两者的结合体:

m ← β₁·m + (1 − β₁)·∇θJ(θ)
s ← β₂·s + (1 − β₂)·∇θJ(θ) ⊙ ∇θJ(θ)
m̂ = m / (1 − β₁ᵗ)
ŝ = s / (1 − β₂ᵗ)
θ ← θ − η·m̂ / (√ŝ + ε)

可以看到前两项和Momentum、RMSprop是非常一致的;由于 m 和 s 的初始值一般设置为0,在训练初期它们可能较小,第三和第四项(偏差修正)主要是为了放大它们;最后一项是参数更新。其中超参数的建议值是 β₁=0.9、β₂=0.999、ε=1e-08。Adam是性能非常好的算法,在TensorFlow中其实现如下:tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08)。
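下面用NumPy写一个Adam更新过程的示意(目标函数取假设的简单二次函数,超参数按上文建议值,仅用于说明更新步骤):

import numpy as np

def grad(theta):
    # 假设目标函数 J(θ) = ||θ||^2 / 2,其梯度就是 θ 本身
    return theta

theta = np.array([1.0, -2.0])
m = np.zeros_like(theta)      # 一阶矩(冲量项)
s = np.zeros_like(theta)      # 二阶矩(梯度平方的指数衰减均值)
eta, beta1, beta2, eps = 0.001, 0.9, 0.999, 1e-8

for t in range(1, 1001):
    g = grad(theta)
    m = beta1 * m + (1 - beta1) * g
    s = beta2 * s + (1 - beta2) * g * g
    m_hat = m / (1 - beta1 ** t)          # 偏差修正,放大训练初期的m
    s_hat = s / (1 - beta2 ** t)
    theta = theta - eta * m_hat / (np.sqrt(s_hat) + eps)

print(theta)   # 逐渐向最优点 (0, 0) 移动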

学习速率

前面也说过学习速率的问题,对于梯度下降算法,这应该是一个最重要的超参数。如果学习速率设置得非常大,那么训练可能不会收敛,就直接发散了;如果设置的比较小,虽然可以收敛,但是训练时间可能无法接受;如果设置的稍微高一些,训练速度会很快,但是当接近最优点会发生震荡,甚至无法稳定。不同学习速率的选择影响可能非常大,如图3所示。


图3 不同学习速率的训练效果

理想的学习速率是:刚开始设置较大,有很快的收敛速度,然后慢慢衰减,保证稳定到达最优点。所以,前面的很多算法都是学习速率自适应的。除此之外,还可以手动实现这样一个自适应过程,如实现学习速率指数式衰减:

η = η₀ · decay_rate^(t / decay_steps)

在TensorFlow中,你可以这样实现:

initial_learning_rate = 0.1
decay_steps = 10000
decay_rate = 1/10
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(initial_learning_rate,                          
                           global_step, decay_steps, decay_rate)
# decayed_learning_rate = learning_rate *
#                decay_rate ^ (global_step / decay_steps)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
training_op = optimizer.minimize(loss, global_step=global_step)

三.总结

本文简单介绍了梯度下降算法的分类以及常用的改进算法,总结来看,优先选择学习速率自适应的算法如RMSprop和Adam算法,大部分情况下其效果是较好的。还有一定要特别注意学习速率的问题。其实还有很多方面会影响梯度下降算法,如梯度的消失与爆炸,这也是要额外注意的。最后不得不说,梯度下降算法目前无法保证全局收敛还将是一个持续性的数学难题。

参考文献

    1. An overview of gradient descent optimization algorithms: http://sebastianruder.com/optimizing-gradient-descent/

    2. Hands-On Machine Learning with Scikit-Learn and TensorFlow, Aurélien Géron, 2017.

    3. NAG: http://proceedings.mlr.press/v28/sutskever13.pdf

    4. Adagrad: http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    5. RMSprop: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    6. Adadelta: https://arxiv.org/pdf/1212.5701v1.pdf

    7. Adam: https://arxiv.org/pdf/1412.6980.pdf

    8. 不同的算法的效果可视化: https://imgur.com/a/Hqolp

 逻辑回归评分卡

https://blog.csdn.net/lll1528238733/article/details/76601897

由逻辑回归的基本原理,我们将客户违约的概率表示为p,则正常的概率为1-p。因此,可以得到比率(Odds):
Odds = p / (1 − p)
此时,客户违约的概率p可表示为:
p = Odds / (1 + Odds)
评分卡设定的分值刻度可以通过将分值表示为比率对数的线性表达式来定义,即可表示为下式:
Score = A − B·ln(Odds)
其中,A和B是常数。式中的负号可以使得违约概率越低,得分越高。通常情况下,这是分值的理想变动方向,即高分值代表低风险,低分值代表高风险。
逻辑回归模型计算比率如下所示:
ln(Odds) = β₀ + β₁x₁ + … + βₙxₙ
其中,用建模数据拟合模型可以得到模型参数 β₀, β₁, …, βₙ。
式中的常数A、B的值可以通过将两个已知或假设的分值带入计算得到。通常情况下,需要设定两个假设:
(1)给某个特定的比率设定特定的预期分值;
(2)确定比率翻番的分数(PDO)。
根据以上的分析,我们首先假设比率为x的特定点的分值为P。由于这里的比率是"违约/正常",比率翻番意味着风险加倍,则比率为2x的点的分值应该为P − PDO。代入式中,可以得到如下两个等式:
P = A − B·ln(x)
P − PDO = A − B·ln(2x)
解得:B = PDO / ln(2),A = P + B·ln(x)。
假设设定评分卡刻度使得比率为{1:20}(违约正常比)时的分值为50分,PDO为10分,代入式中求得:B=14.43,A=6.78。
则分值的计算公式可表示为:
Score = 6.78 − 14.43·ln(Odds)
评分卡刻度参数A和B确定以后,就可以计算比率和违约概率,以及对应的分值了。通常将常数A称为补偿,常数B称为刻度。
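按上面的假设,可以用几行Python核对A、B的取值(示意代码):

import numpy as np

P0, PDO, theta0 = 50, 10, 1.0 / 20   # 基准分50,PDO=10,基准违约/正常比1:20
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)
print(round(A, 2), round(B, 2))      # 6.78 14.43

# 某客户违约概率 p=0.2 时的得分
p = 0.2
score = A - B * np.log(p / (1 - p))
print(round(score, 1))               # 约 26.8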
则评分卡的分值可表达为:

Score = A − B·( β₀ + β₁x₁ + … + βₙxₙ )

式中:变量x₁…xₙ是出现在最终模型中的自变量,即为入模指标。由于此时所有自变量都经过了WOE转换,可以将其中每一个都写成 (βᵢ·ωᵢⱼ)·δᵢⱼ 的形式:

Score = A − B·( β₀ + ∑ᵢ ∑ⱼ (βᵢ·ωᵢⱼ)·δᵢⱼ )

式中 ωᵢⱼ 为第i个变量第j个分箱(行)的WOE,为已知量;βᵢ 为逻辑回归方程中的系数,为已知量;δᵢⱼ 为二元变量,表示变量i是否取第j个值。上式可重新表示为:

Score = (A − B·β₀) − ∑ᵢ ∑ⱼ (B·βᵢ·ωᵢⱼ)·δᵢⱼ

此式即为最终评分卡公式。如果对x₁…xₙ中每个变量的不同行(分箱)分别计算其WOE值,即得到标准评分卡格式,如表3.20所示(图略)。
表3.20表明,变量x₁有k₁行,变量x₂有k₂行,以此类推;基础分值等于(A − B·β₀);由于分值分配公式中的负号,模型参数β₀, β₁, …, βₙ也应该是负值;变量xᵢ的第j行的分值取决于以下三个数值:

(1)刻度因子B;
(2)逻辑回归方程的参数βᵢ;
(3)该行的WOE值 ωᵢⱼ。
综上,我们详细讲述了模型开发及生成标准评分卡各步骤的处理结果,自动生成标准评分卡的R完整代码:

最终评分卡展示

需要特别说明的是,上述开发的信用风险评级模型只包含定量和定性两部分,在实际的使用中还要充分考虑到信用风险的特定,增加综合调整部分,以应对可能对客户信用影响较大的突发事件,如客户被刑事起诉、遭遇重大疾病等。完整的信用风险标准评分卡模型,如表3.21所示:

(表3.21 图略)

Python代码,数据库:美国UCI的德国信用评分卡


 sklearn脚本

改善空间:去除p值大于0.05的变量,比较混淆矩阵结果

# -*- coding: utf-8 -*-
"""
技术文档
https://www.cnblogs.com/webRobot/p/7216614.html
model accuracy is: 0.755
model precision is: 0.697841726618705
model sensitivity is: 0.3233333333333333
f1_score: 0.44191343963553525
AUC: 0.7626619047619048
"""
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
# 混淆矩阵计算
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

df_german = pd.read_excel("german_woe.xlsx")
y = df_german["target"]
x = df_german.loc[:, "Duration of Credit (month)":"Foreign Worker"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# 交叉验证
print('Accuracy:', accuracy_score(y_test, predictions))
scores = cross_val_score(classifier, X_train, y_train, cv=10)
print(np.mean(scores), scores)

# 得分公式
'''
P0 = 50
PDO = 30
theta0 = 1.0/20
B = PDO/np.log(2)
A = P0 + B*np.log(theta0)
P0 = 50
PDO = 30
theta0 = 1.0/60
'''
def Score(probability):
    # 底数是e
    score = A - B * np.log(probability / (1 - probability))
    return score

# 批量获取得分
def List_score(pos_probablity_list):
    list_score = []
    for probability in pos_probablity_list:
        score = Score(probability)
        list_score.append(score)
    return list_score

P0 = 50
PDO = 10
theta0 = 1.0 / 20
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)
print("A:", A)
print("B:", B)

list_coef = list(classifier.coef_[0])
intercept = classifier.intercept_

# 获取所有x数据的预测概率,包括好客户和坏客户,0为好客户,1为坏客户
probablity_list = classifier.predict_proba(x)
# 获取所有x数据的坏客户预测概率
pos_probablity_list = [i[1] for i in probablity_list]
# 获取所有客户分数
list_score = List_score(pos_probablity_list)
list_predict = classifier.predict(x)
df_result = pd.DataFrame({"label": y, "predict": list_predict,
                          "pos_probablity": pos_probablity_list, "score": list_score})
df_result.to_excel("score_proba.xlsx")

# 变量名列表
list_vNames = df_german.columns
# 去掉第一个变量名target
list_vNames = list_vNames[1:]
df_coef = pd.DataFrame({"variable_names": list_vNames, "coef": list_coef})
df_coef.to_excel("coef.xlsx")

y_true = y
y_pred = list_predict
accuracyScore = accuracy_score(y_true, y_pred)
print('model accuracy is:', accuracyScore)

# precision,TP/(TP+FP) (真阳性)/(真阳性+假阳性)
precision = precision_score(y_true, y_pred)
print('model precision is:', precision)

# recall(sensitive)敏感度,(TP)/(TP+FN)
sensitivity = recall_score(y_true, y_pred)
print('model sensitivity is:', sensitivity)

# F1 = 2 x (精确率 x 召回率) / (精确率 + 召回率)
# F1分数会同时考虑精确率和召回率,可理解为两者的加权平均值,最佳值为1、最差值为0
f1Score = f1_score(y_true, y_pred)
print("f1_score:", f1Score)

def AUC(y_true, y_scores):
    auc_value = 0
    # 通过fpr、tpr,用auc(fpr,tpr)来计算AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_scores, pos_label=1)
    auc_value = auc(fpr, tpr)   # 计算auc的值
    if auc_value < 0.5:
        auc_value = 1 - auc_value
    return auc_value

def Draw_roc(auc_value):
    fpr, tpr, thresholds = metrics.roc_curve(y, list_score, pos_label=0)
    # 画对角线
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Diagonal line')
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_value)
    plt.title('ROC curve')
    plt.legend(loc="lower right")

# 评价AUC表现
def AUC_performance(AUC):
    if AUC >= 0.7:
        print("good classifier")
    if 0.7 > AUC > 0.6:
        print("not very good classifier")
    if 0.6 >= AUC > 0.5:
        print("useless classifier")
    if 0.5 >= AUC:
        print("bad classifier,with sorting problems")

# AUC验证
auc_value = AUC(y, list_score)
print("AUC:", auc_value)
# 评价AUC表现
AUC_performance(auc_value)
# 绘制ROC曲线
Draw_roc(auc_value)

statsmodels脚本

# -*- coding: utf-8 -*-
"""
statsmodels 的逻辑回归模型
"""
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
# 混淆矩阵计算
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

df_german = pd.read_excel("german_woe.xlsx")
y = df_german["target"]
x = df_german.loc[:, "Duration of Credit (month)":"Foreign Worker"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

logit_mod = sm.Logit(y_train, X_train)
logitres = logit_mod.fit(disp=False)
predict_y = logitres.predict(x)
summary_logistic = logitres.summary()

file_logistic_summary = open("summary_logistic.txt", 'w')
file_logistic_summary.write(str(summary_logistic))
file_logistic_summary.close()

# 验证
def AUC(y_true, y_scores):
    auc_value = 0
    # 通过fpr、tpr,用auc(fpr,tpr)来计算AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_scores, pos_label=1)
    auc_value = auc(fpr, tpr)   # 计算auc的值
    if auc_value < 0.5:
        auc_value = 1 - auc_value
    return auc_value

def Draw_roc(auc_value):
    fpr, tpr, thresholds = metrics.roc_curve(y, predict_y, pos_label=1)
    # 画对角线
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Diagonal line')
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_value)
    plt.title('ROC curve')
    plt.legend(loc="lower right")

# 评价AUC表现
def AUC_performance(AUC):
    if AUC >= 0.7:
        print("good classifier")
    if 0.7 > AUC > 0.6:
        print("not very good classifier")
    if 0.6 >= AUC > 0.5:
        print("useless classifier")
    if 0.5 >= AUC:
        print("bad classifier,with sorting problems")

# AUC验证
auc_value = AUC(y, predict_y)
print("AUC:", auc_value)
# 评价AUC表现
AUC_performance(auc_value)
# 绘制ROC曲线
Draw_roc(auc_value)
statsmodels和sklearn计算出的系数有差异(sklearn的LogisticRegression默认带C=1.0的L2正则化并拟合截距,而这里的sm.Logit没有正则化、也未显式加常数项),但AUC值差不多,都约为0.76。


样本测试:

woe,coef,odds,坏客户概率,好客户概率之间关系,手动计算结果0.547和机器计算值0.547一致。



坏客户概率计算公式:

p(坏) = 1 / ( 1 + e^( −(β₀ + β₁·woe₁ + β₂·woe₂ + … + βₙ·woeₙ) ) )

其中β₀为截距(intercept_),βᵢ为各入模变量的回归系数,woeᵢ为该客户在对应变量上的WOE值。
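下面是一个示意性的核对片段(WOE值与系数均为假设值,实际应取自上文导出的coef.xlsx和classifier.intercept_),演示如何由WOE和回归系数手动还原坏客户概率:

import numpy as np

# 假设:某客户各入模变量的WOE值与对应回归系数(仅作演示)
woe_values = np.array([0.35, -0.12, 0.78, 0.05])
coefs      = np.array([0.62,  0.41, 0.55, 0.30])
intercept  = -0.85

z = intercept + np.dot(coefs, woe_values)      # 线性部分 β0 + Σ βi·woe_i
p_bad = 1.0 / (1.0 + np.exp(-z))               # 坏客户概率
print(round(p_bad, 3))
# 该结果应与 classifier.predict_proba(...)[:, 1] 给出的概率一致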

R代码,数据库:美国UCI的德国信用评分卡

library(klaR)
library(InformationValue)
data(GermanCredit)
train_kfold<-sample(nrow(GermanCredit),800,replace = F)
train_kfolddata<-GermanCredit[train_kfold,] #提取样本数据集
test_kfolddata<-GermanCredit[-train_kfold,] #提取测试数据集
credit_risk<-ifelse(train_kfolddata[,"credit_risk"]=="good",0,1)
#将违约样本用“1”表示,正常样本用“0”表示。
tmp<-train_kfolddata[,-21]
data<-cbind(tmp,credit_risk)
quant_vars<-c("duration","amount","installment_rate","present_residence","age",
"number_credits","people_liable","credit_risk")
#获取定量指标
quant_GermanCredit<-data[,quant_vars] #提取定量指标
#逐步回归法,获取自变量中对违约状态影响最显著的指标
base.mod<-lm(credit_risk~1,data = quant_GermanCredit)
#获取线性回归模型的截距
all.mod<-lm(credit_risk~.,data = quant_GermanCredit)
#获取完整的线性回归模型
stepMod<-step(base.mod,scope = list(lower=base.mod,upper=all.mod),
direction = "both",trace = 0,steps = 1000)
#采用双向逐步回归法,筛选变量
shortlistedVars<-names(unlist(stepMod[[1]]))
#获取逐步回归得到的变量列表
shortlistedVars<-shortlistedVars[!shortlistedVars %in%"(Intercept)"]
#删除逐步回归的截距
print(shortlistedVars)
#输出逐步回归后得到的变量
quant_model_vars<-c("duration","amount","installment_rate","age")
#完成定量入模指标
#提取数据集中全部的定性指标
factor_vars<-c("status","credit_history","purpose","savings","employment_duration",
"personal_status_sex","other_debtors","property",
"other_installment_plans","housing","job","telephone","foreign_worker")
#获取所有名义变量
all_iv<-data.frame(VARS=factor_vars,IV=numeric(length(factor_vars)),
STRENGTH=character(length(factor_vars)),stringsAsFactors = F)
#初始化待输出的数据框
for(factor_var in factor_vars)
{
all_iv[all_iv$VARS==factor_var,"IV"]<-InformationValue::IV(X=
data[,factor_var],Y=data$credit_risk)
#计算每个指标的IV值
all_iv[all_iv$VARS==factor_var,"STRENGTH"]<-attr(InformationValue::IV(X=
data[,factor_var],Y=data$credit_risk),"howgood")
#提取每个IV指标的描述
}
all_iv<-all_iv[order(-all_iv$IV),] #排序IV
qual_model_vars<-subset(all_iv,STRENGTH=="Highly Predictive")[1:5,]
qual_model_vars<-c("status","credit_history","savings","purpose","property")
#连续变量分段和离散变量降维
#1.变量duration
library(smbinning)
result<-smbinning(df=data,y="credit_risk",x="duration",p=0.05)
result$ivtable
duration_Cutpoint<-c()
duration_WoE<-c()
duration<-data[,"duration"]
for(i in 1:length(duration))
{
if(duration[i]<=8)
{
duration_Cutpoint[i]<-"<= 8"
duration_WoE[i]<--1.5670
}
if(duration[i]<=33&duration[i]>8)
{
duration_Cutpoint[i]<-"<= 33"
duration_WoE[i]<--0.0924
}
if(duration[i]> 33)
{
duration_Cutpoint[i]<-"> 33"
duration_WoE[i]<-0.7863
}
}
#2.变量amount
result<-smbinning(df=data,y="credit_risk",x="amount",p=0.05)
result$ivtable
amount_Cutpoint<-c()
amount_WoE<-c()
amount<-data[,"amount"]
for(i in 1:length(amount))
{
if(amount[i]<= 3913)
{
amount_Cutpoint[i]<-"<= 3913"
amount_WoE[i]<--0.2536
}
if(amount[i]<= 9283&amount[i]> 3913)
{
amount_Cutpoint[i]<-"<= 9283"
amount_WoE[i]<-0.4477
}
if(amount[i]> 9283)
{
amount_Cutpoint[i]<-"> 9283"
amount_WoE[i]<-1.3109
}
}
#3.变量age
result<-smbinning(df=data,y="credit_risk",x="age",p=0.05)
result$ivtable
age_Cutpoint<-c()
age_WoE<-c()
age<-data[,"age"]
for(i in 1:length(age))
{
if(age[i]<= 34)
{
age_Cutpoint[i]<-"<= 34"
age_WoE[i]<-0.2279
}
if(age[i] > 34)
{
age_Cutpoint[i]<-" > 34"
age_WoE[i]<--0.3059
}
}
#4.变量installment_rate等距分段
install_data<-data[,c("installment_rate","credit_risk")]
tb1<-table(install_data)
total<-list()
for(i in 1:nrow(tb1))
{
total[i]<-sum(tb1[i,])
}
t.tb1<-cbind(tb1,total)
goodrate<-as.numeric(t.tb1[,"0"])/as.numeric(t.tb1[,"total"])
badrate<-as.numeric(t.tb1[,"1"])/as.numeric(t.tb1[,"total"])
gb.tbl<-cbind(t.tb1,goodrate,badrate)
Odds<-goodrate/badrate
LnOdds<-log(Odds)
tt.tb1<-cbind(gb.tbl,Odds,LnOdds)
WoE<-log((as.numeric(tt.tb1[,"0"])/700)/(as.numeric(tt.tb1[,"1"])/300))
all.tb1<-cbind(tt.tb1,WoE)
all.tb1
installment_rate_Cutpoint<-c()
installment_rate_WoE<-c()
installment_rate<-data[,"installment_rate"]
for(i in 1:length(installment_rate))
{
if(installment_rate[i]==1)
{
installment_rate_Cutpoint[i]<-"=1"
installment_rate_WoE[i]<-0.06252036
}
if(installment_rate[i]==2)
{
installment_rate_Cutpoint[i]<-"=2"
installment_rate_WoE[i]<-0.1459539
}
if(installment_rate[i]==3)
{
installment_rate_Cutpoint[i]<-"=3"
installment_rate_WoE[i]<--0.03937517
}
if(installment_rate[i]==4)
{
installment_rate_Cutpoint[i]<-"=4"
installment_rate_WoE[i]<--0.1657562
}
}
#定性指标的降维和WoE
discrete_data<-data[,c("status","credit_history","savings","purpose",
"property","credit_risk")]
summary(discrete_data)
#对purpose指标进行降维
x<-discrete_data[,c("purpose","credit_risk")]
d<-as.matrix(x)
for(i in 1:nrow(d))
{
#合并car(new)、car(used)
if(as.character(d[i,"purpose"])=="car (new)")
{
d[i,"purpose"]<-as.character("car(new/used)")
}
if(as.character(d[i,"purpose"])=="car (used)")
{
d[i,"purpose"]<-as.character("car(new/used)")
}
#合并radio/television、furniture/equipment
if(as.character(d[i,"purpose"])=="radio/television")
{
d[i,"purpose"]<-as.character("radio/television/furniture/equipment")
}
if(as.character(d[i,"purpose"])=="furniture/equipment")
{
d[i,"purpose"]<-as.character("radio/television/furniture/equipment")
}
#合并others、repairs、business
if(as.character(d[i,"purpose"])=="others")
{
d[i,"purpose"]<-as.character("others/repairs/business")
}
if(as.character(d[i,"purpose"])=="repairs")
{
d[i,"purpose"]<-as.character("others/repairs/business")
}
if(as.character(d[i,"purpose"])=="business")
{
d[i,"purpose"]<-as.character("others/repairs/business")
}
#合并retraining、education
if(as.character(d[i,"purpose"])=="retraining")
{
d[i,"purpose"]<-as.character("retraining/education")
}
if(as.character(d[i,"purpose"])=="education")
{
d[i,"purpose"]<-as.character("retraining/education")
}
}
new_data<-cbind(discrete_data[,c(-4,-6)],d)
#替换原数据集中的“purpose”指标的值
woemodel<-woe(credit_risk~.,data = new_data,zeroadj=0.5,applyontrain=TRUE)
woemodel$woe
#1.status
status<-as.matrix(new_data[,"status"])
colnames(status)<-"status"
status_WoE<-c()
for(i in 1:length(status))
{
if(status[i]=="... < 100 DM")
{
status_WoE[i]<--0.8671300
}
if(status[i]=="0 <= ... < 200 DM")
{
status_WoE[i]<--0.4240681
}
if(status[i]=="... >= 200 DM / salary for at least 1 year")
{
status_WoE[i]<-0.4129033
}
if(status[i]=="no checking account")
{
status_WoE[i]<-1.2237524
}
}
#2.credit_history
credit_history<-as.matrix(new_data[,"credit_history"])
colnames(credit_history)<-"credit_history"
credit_history_WoE<-c()
for(i in 1:length(credit_history))
{
if(credit_history[i]=="no credits taken/all credits paid back duly")
{
credit_history_WoE[i]<--1.53771824
}
if(credit_history[i]=="all credits at this bank paid back duly")
{
credit_history_WoE[i]<--1.00079000
}
if(credit_history[i]=="existing credits paid back duly till now")
{
credit_history_WoE[i]<--0.09646414
}
if(credit_history[i]=="delay in paying off in the past")
{
credit_history_WoE[i]<--0.01996074
}
if(credit_history[i]=="critical account/other credits existing")
{
credit_history_WoE[i]<-0.77276102
}
}
#3.savings
savings<-as.matrix(new_data[,"savings"])
colnames(savings)<-"savings"
savings_WoE<-c()
for(i in 1:length(savings))
{
if(savings[i]=="... < 100 DM")
{
savings_WoE[i]<--0.3051490
}
if(savings[i]=="100 <= ... < 500 DM")
{
savings_WoE[i]<--0.2267733
}
if(savings[i]=="500 <= ... < 1000 DM")
{
savings_WoE[i]<-0.8340112
}
if(savings[i]=="... >= 1000 DM")
{
savings_WoE[i]<-1.1739617
}
if(savings[i]=="unknown/no savings account")
{
savings_WoE[i]<-0.7938144
}
}
#4.property
property<-as.matrix(new_data[,"property"])
colnames(property)<-"property"
property_WoE<-c()
for(i in 1:length(property))
{
if(property[i]=="real estate")
{
property_WoE[i]<-0.49346566
}
if(property[i]=="building society savings agreement/life insurance")
{
property_WoE[i]<--0.16507975
}
if(property[i]=="car or other")
{
property_WoE[i]<-0.08054425
}
if(property[i]=="unknown/no property")
{
property_WoE[i]<--0.65586969
}
}
#5.purpose
purpose<-as.matrix(new_data[,"purpose"])
colnames(purpose)<-"purpose"
purpose_WoE<-c()
for(i in 1:length(purpose))
{
if(purpose[i]=="car(new/used)")
{
purpose_WoE[i]<--0.11260594
}
if(purpose[i]=="domestic appliances")
{
purpose_WoE[i]<-0.53602528
}
if(purpose[i]=="others/repairs/business")
{
purpose_WoE[i]<--0.09146793
}
if(purpose[i]=="radio/television/furniture/equipment")
{
purpose_WoE[i]<--0.23035114
}
if(purpose[i]=="retraining/education")
{
purpose_WoE[i]<--0.43547619
}
}
#入模定量和定性指标
model_data<-cbind(data[,quant_model_vars],data[,qual_model_vars])
#入模定量和定性指标的WOE
credit_risk<-as.matrix(data[,"credit_risk"])
colnames(credit_risk)<-"credit_risk"
model_data_WOE<-as.data.frame(cbind(duration_WoE,amount_WoE,age_WoE,
installment_rate_WoE,status_WoE,credit_history_WoE,
savings_WoE,property_WoE,purpose_WoE,credit_risk))
#入模定量和定性指标“分段”
model_data_Cutpoint<-cbind(duration_Cutpoint,amount_Cutpoint,age_Cutpoint,
installment_rate_Cutpoint,status,credit_history,
savings,property,purpose)
#逻辑回归
m<-glm(credit_risk~.,data=model_data_WOE,family = binomial())
alpha_beta<-function(basepoints,baseodds,pdo)
{
beta<-pdo/log(2)
alpha<-basepoints+beta*log(baseodds)
return(list(alpha=alpha,beta=beta))
}
coefficients<-m$coefficients
#通过指定特定比率(1/20)的特定分值(50)和比率翻番的分数(10),来计算评分卡的系数alpha和beta
x<-alpha_beta(50,0.05,10)
#计算基础分值
basepoint<-round(x$alpha-x$beta*coefficients[1])
#1.duration_score
duration_score<-round(as.matrix(-(model_data_WOE[,"duration_WoE"]*
coefficients["duration_WoE"]*x$beta)))
colnames(duration_score)<-"duration_score"
#2.amount_score
amount_score<-round(as.matrix(-(model_data_WOE[,"amount_WoE"]*
coefficients["amount_WoE"]*x$beta)))
colnames(amount_score)<-"amount_score"
#3.age_score
age_score<-round(as.matrix(-(model_data_WOE[,"age_WoE"]*
coefficients["age_WoE"]*x$beta)))
colnames(age_score)<-"age_score"
#4.installment_rate_score
installment_rate_score<-round(as.matrix(-(model_data_WOE[,"installment_rate_WoE"]*
coefficients["installment_rate_WoE"]*x$beta)))
colnames(installment_rate_score)<-"installment_rate_score"
#5.status_score
status_score<-round(as.matrix(-(model_data_WOE[,"status_WoE"]*
coefficients["status_WoE"]*x$beta)))
colnames(status_score)<-"status_score"
#6.credit_history_score
credit_history_score<-round(as.matrix(-(model_data_WOE[,"credit_history_WoE"]*
coefficients["credit_history_WoE"]*x$beta)))
colnames(credit_history_score)<-"credit_history_score"
#7.savings_score
savings_score<-round(as.matrix(-(model_data_WOE[,"savings_WoE"]*
coefficients["savings_WoE"]*x$beta)))
colnames(savings_score)<-"savings_score"
#8.property_score
property_score<-round(as.matrix(-(model_data_WOE[,"property_WoE"]*
coefficients["property_WoE"]*x$beta)))
colnames(property_score)<-"property_score"
#9.purpose_score
purpose_score<-round(as.matrix(-(model_data_WOE[,"purpose_WoE"]*
coefficients["purpose_WoE"]*x$beta)))
colnames(purpose_score)<-"purpose_score"
#输出最终的CSV格式的打分卡
#1.基础分值
r1<-c("","basepoint",20)
m1<-matrix(r1,nrow = 1)
colnames(m1)<-c("Basepoint","Basepoint","Score")
#2.duration的分值
duration_scoreCard<-cbind(as.matrix(c("Duration","",""),ncol=1),
unique(cbind(duration_Cutpoint,duration_score)))
#View(duration_scoreCard)
#3.amount的分值
amount_scoreCard<-cbind(as.matrix(c("Amount","",""),ncol=1),
unique(cbind(amount_Cutpoint,amount_score)))
#View(amount_scoreCard)
#4.age的分值
age_scoreCard<-cbind(as.matrix(c("Age",""),ncol=1),
unique(cbind(age_Cutpoint,age_score)))
#View(age_scoreCard)
#5.installment_rate的分值
installment_rate_scoreCard<-cbind(as.matrix(c("Installment_rate","","",""),ncol=1),
unique(cbind(installment_rate_Cutpoint,installment_rate_score)))
#View(installment_rate_scoreCard)
#6.status的分值
status_scoreCard<-cbind(as.matrix(c("Status","","",""),ncol=1),
unique(cbind(status,status_score)))
#View(status_scoreCard)
#7.credit_history的分值
credit_history_scoreCard<-cbind(as.matrix(c("Credit_history","","","",""),ncol=1),
unique(cbind(credit_history,credit_history_score)))
#View(credit_history_scoreCard)
#8.savings的分值
savings_scoreCard<-cbind(as.matrix(c("Savings","","","",""),ncol=1),
unique(cbind(savings,savings_score)))
#View(savings_scoreCard)
#9.property的分值
property_scoreCard<-cbind(as.matrix(c("Property","","",""),ncol=1),
unique(cbind(property,property_score)))
#View(property_scoreCard)
#10.purpose的分值
purpose_scoreCard<-cbind(as.matrix(c("Purpose","","","",""),ncol=1),
unique(cbind(purpose,purpose_score)))
#View(purpose_scoreCard)
scoreCard_CSV<-rbind(m1,duration_scoreCard,amount_scoreCard,age_scoreCard,
installment_rate_scoreCard,status_scoreCard,credit_history_scoreCard,
savings_scoreCard,property_scoreCard,purpose_scoreCard)
#将标准评分卡输出到项目文件中,且命名为ScoreCard.CSV,调整格式即可得到标准评分卡
write.csv(scoreCard_CSV,"C:/Users/ZL/Desktop/creditcard_model/ScoreCard.CSV")

sklearn逻辑回归的源码,57位开发者贡献  

https://github.com/scikit-learn/scikit-learn/blob/bac89c2/sklearn/linear_model/logistic.py#L997

"""
Logistic Regression
"""# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Fabian Pedregosa <f@bianp.net>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# Manoj Kumar <manojkumarsivaraj334@gmail.com>
# Lars Buitinck
# Simon Wu <s8wu@uwaterloo.ca>
# Arthur Mensch <arthur.mensch@m4x.orgimport numbers
import warningsimport numpy as np
from scipy import optimize, sparse
from scipy.special import expitfrom .base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator
from .sag import sag_solver
from ..preprocessing import LabelEncoder, LabelBinarizer
from ..svm.base import _fit_liblinear
from ..utils import check_array, check_consistent_length, compute_class_weight
from ..utils import check_random_state
from ..utils.extmath import (log_logistic, safe_sparse_dot, softmax,
squared_norm)
from ..utils.extmath import row_norms
from ..utils.fixes import logsumexp
from ..utils.optimize import newton_cg
from ..utils.validation import check_X_y
from ..exceptions import (NotFittedError, ConvergenceWarning,
ChangedBehaviorWarning)
from ..utils.multiclass import check_classification_targets
from ..utils import Parallel, delayed, effective_n_jobs
from ..model_selection import check_cv
from ..externals import six
from ..metrics import get_scorer# .. some helper functions for logistic_regression_path ..
def _intercept_dot(w, X, y):
"""Computes y * np.dot(X, w). It takes into consideration if the intercept should be fit or not. Parameters
----------
w : ndarray, shape (n_features,) or (n_features + 1,)
Coefficient vector. X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. y : ndarray, shape (n_samples,)
Array of labels. Returns
-------
w : ndarray, shape (n_features,)
Coefficient vector without the intercept weight (w[-1]) if the
intercept should be fit. Unchanged otherwise. c : float
The intercept. yz : float
y * np.dot(X, w).
"""
c = 0.
if w.size == X.shape[1] + 1:
c = w[-1]
w = w[:-1] z = safe_sparse_dot(X, w) + c
yz = y * z
return w, c, yzdef _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None):
"""Computes the logistic loss and gradient. Parameters
----------
w : ndarray, shape (n_features,) or (n_features + 1,)
Coefficient vector. X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. y : ndarray, shape (n_samples,)
Array of labels. alpha : float
Regularization parameter. alpha is equal to 1 / C. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Returns
-------
out : float
Logistic loss. grad : ndarray, shape (n_features,) or (n_features + 1,)
Logistic gradient.
"""
n_samples, n_features = X.shape
grad = np.empty_like(w) w, c, yz = _intercept_dot(w, X, y) if sample_weight is None:
sample_weight = np.ones(n_samples) # Logistic loss is the negative of the log of the logistic function.
out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w) z = expit(yz)
z0 = sample_weight * (z - 1) * y grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w # Case where we fit the intercept.
if grad.shape[0] > n_features:
grad[-1] = z0.sum()
return out, graddef _logistic_loss(w, X, y, alpha, sample_weight=None):
"""Computes the logistic loss. Parameters
----------
w : ndarray, shape (n_features,) or (n_features + 1,)
Coefficient vector. X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. y : ndarray, shape (n_samples,)
Array of labels. alpha : float
Regularization parameter. alpha is equal to 1 / C. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Returns
-------
out : float
Logistic loss.
"""
w, c, yz = _intercept_dot(w, X, y) if sample_weight is None:
sample_weight = np.ones(y.shape[0]) # Logistic loss is the negative of the log of the logistic function.
out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)
return outdef _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
"""Computes the gradient and the Hessian, in the case of a logistic loss. Parameters
----------
w : ndarray, shape (n_features,) or (n_features + 1,)
Coefficient vector. X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. y : ndarray, shape (n_samples,)
Array of labels. alpha : float
Regularization parameter. alpha is equal to 1 / C. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Returns
-------
grad : ndarray, shape (n_features,) or (n_features + 1,)
Logistic gradient. Hs : callable
Function that takes the gradient as a parameter and returns the
matrix product of the Hessian and gradient.
"""
n_samples, n_features = X.shape
grad = np.empty_like(w)
fit_intercept = grad.shape[0] > n_features w, c, yz = _intercept_dot(w, X, y) if sample_weight is None:
sample_weight = np.ones(y.shape[0]) z = expit(yz)
z0 = sample_weight * (z - 1) * y grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w # Case where we fit the intercept.
if fit_intercept:
grad[-1] = z0.sum() # The mat-vec product of the Hessian
d = sample_weight * z * (1 - z)
if sparse.issparse(X):
dX = safe_sparse_dot(sparse.dia_matrix((d, 0),
shape=(n_samples, n_samples)), X)
else:
# Precompute as much as possible
dX = d[:, np.newaxis] * X if fit_intercept:
# Calculate the double derivative with respect to intercept
# In the case of sparse matrices this returns a matrix object.
dd_intercept = np.squeeze(np.array(dX.sum(axis=0))) def Hs(s):
ret = np.empty_like(s)
ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
ret[:n_features] += alpha * s[:n_features] # For the fit intercept case.
if fit_intercept:
ret[:n_features] += s[-1] * dd_intercept
ret[-1] = dd_intercept.dot(s[:n_features])
ret[-1] += d.sum() * s[-1]
return ret return grad, Hsdef _multinomial_loss(w, X, Y, alpha, sample_weight):
"""Computes multinomial loss and class probabilities. Parameters
----------
w : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Coefficient vector. X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. Y : ndarray, shape (n_samples, n_classes)
Transformed labels according to the output of LabelBinarizer. alpha : float
Regularization parameter. alpha is equal to 1 / C. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Returns
-------
loss : float
Multinomial loss. p : ndarray, shape (n_samples, n_classes)
Estimated class probabilities. w : ndarray, shape (n_classes, n_features)
Reshaped param vector excluding intercept terms. Reference
---------
Bishop, C. M. (2006). Pattern recognition and machine learning.
Springer. (Chapter 4.3.4)
"""
n_classes = Y.shape[1]
n_features = X.shape[1]
fit_intercept = w.size == (n_classes * (n_features + 1))
w = w.reshape(n_classes, -1)
sample_weight = sample_weight[:, np.newaxis]
if fit_intercept:
intercept = w[:, -1]
w = w[:, :-1]
else:
intercept = 0
p = safe_sparse_dot(X, w.T)
p += intercept
p -= logsumexp(p, axis=1)[:, np.newaxis]
loss = -(sample_weight * Y * p).sum()
loss += 0.5 * alpha * squared_norm(w)
p = np.exp(p, p)
return loss, p, wdef _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
"""Computes the multinomial loss, gradient and class probabilities. Parameters
----------
w : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Coefficient vector. X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. Y : ndarray, shape (n_samples, n_classes)
Transformed labels according to the output of LabelBinarizer. alpha : float
Regularization parameter. alpha is equal to 1 / C. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples. Returns
-------
loss : float
Multinomial loss. grad : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Ravelled gradient of the multinomial loss. p : ndarray, shape (n_samples, n_classes)
Estimated class probabilities Reference
---------
Bishop, C. M. (2006). Pattern recognition and machine learning.
Springer. (Chapter 4.3.4)
"""
n_classes = Y.shape[1]
n_features = X.shape[1]
fit_intercept = (w.size == n_classes * (n_features + 1))
grad = np.zeros((n_classes, n_features + bool(fit_intercept)),
dtype=X.dtype)
loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
sample_weight = sample_weight[:, np.newaxis]
diff = sample_weight * (p - Y)
grad[:, :n_features] = safe_sparse_dot(diff.T, X)
grad[:, :n_features] += alpha * w
if fit_intercept:
grad[:, -1] = diff.sum(axis=0)
return loss, grad.ravel(), pdef _multinomial_grad_hess(w, X, Y, alpha, sample_weight):
"""
Computes the gradient and the Hessian, in the case of a multinomial loss. Parameters
----------
w : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Coefficient vector. X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. Y : ndarray, shape (n_samples, n_classes)
Transformed labels according to the output of LabelBinarizer. alpha : float
Regularization parameter. alpha is equal to 1 / C. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples. Returns
-------
grad : array, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Ravelled gradient of the multinomial loss. hessp : callable
Function that takes in a vector input of shape (n_classes * n_features)
or (n_classes * (n_features + 1)) and returns matrix-vector product
with hessian. References
----------
Barak A. Pearlmutter (1993). Fast Exact Multiplication by the Hessian.
http://www.bcl.hamilton.ie/~barak/papers/nc-hessian.pdf
"""
n_features = X.shape[1]
n_classes = Y.shape[1]
fit_intercept = w.size == (n_classes * (n_features + 1)) # `loss` is unused. Refactoring to avoid computing it does not
# significantly speed up the computation and decreases readability
loss, grad, p = _multinomial_loss_grad(w, X, Y, alpha, sample_weight)
sample_weight = sample_weight[:, np.newaxis] # Hessian-vector product derived by applying the R-operator on the gradient
# of the multinomial loss function.
def hessp(v):
v = v.reshape(n_classes, -1)
if fit_intercept:
inter_terms = v[:, -1]
v = v[:, :-1]
else:
inter_terms = 0
# r_yhat holds the result of applying the R-operator on the multinomial
# estimator.
r_yhat = safe_sparse_dot(X, v.T)
r_yhat += inter_terms
r_yhat += (-p * r_yhat).sum(axis=1)[:, np.newaxis]
r_yhat *= p
r_yhat *= sample_weight
hessProd = np.zeros((n_classes, n_features + bool(fit_intercept)))
hessProd[:, :n_features] = safe_sparse_dot(r_yhat.T, X)
hessProd[:, :n_features] += v * alpha
if fit_intercept:
hessProd[:, -1] = r_yhat.sum(axis=0)
        return hessProd.ravel()

    return grad, hessp


def _check_solver(solver, penalty, dual):
    if solver == 'warn':
        solver = 'liblinear'
        warnings.warn("Default solver will be changed to 'lbfgs' in 0.22. "
                      "Specify a solver to silence this warning.",
                      FutureWarning)

    all_solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
    if solver not in all_solvers:
        raise ValueError("Logistic Regression supports only solvers in %s, got"
                         " %s." % (all_solvers, solver))

    all_penalties = ['l1', 'l2']
    if penalty not in all_penalties:
        raise ValueError("Logistic Regression supports only penalties in %s,"
                         " got %s." % (all_penalties, penalty))

    if solver not in ['liblinear', 'saga'] and penalty != 'l2':
        raise ValueError("Solver %s supports only l2 penalties, "
                         "got %s penalty." % (solver, penalty))
    if solver != 'liblinear' and dual:
        raise ValueError("Solver %s supports only "
                         "dual=False, got dual=%s" % (solver, dual))
    return solver


def _check_multi_class(multi_class, solver, n_classes):
    if multi_class == 'warn':
        multi_class = 'ovr'
        if n_classes > 2:
            warnings.warn("Default multi_class will be changed to 'auto' in"
                          " 0.22. Specify the multi_class option to silence "
                          "this warning.", FutureWarning)
    if multi_class == 'auto':
        if solver == 'liblinear':
            multi_class = 'ovr'
        elif n_classes > 2:
            multi_class = 'multinomial'
        else:
            multi_class = 'ovr'
    if multi_class not in ('multinomial', 'ovr'):
        raise ValueError("multi_class should be 'multinomial', 'ovr' or "
                         "'auto'. Got %s." % multi_class)
    if multi_class == 'multinomial' and solver == 'liblinear':
        raise ValueError("Solver %s does not support "
                         "a multinomial backend." % solver)
    return multi_class


def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
                             max_iter=100, tol=1e-4, verbose=0,
                             solver='lbfgs', coef=None,
                             class_weight=None, dual=False, penalty='l2',
                             intercept_scaling=1., multi_class='warn',
                             random_state=None, check_input=True,
                             max_squared_sum=None, sample_weight=None):
"""Compute a Logistic Regression model for a list of regularization
parameters. This is an implementation that uses the result of the previous model
to speed up computations along the set of solutions, making it faster
than sequentially calling LogisticRegression for the different parameters.
Note that there will be no speedup with liblinear solver, since it does
not handle warm-starting. Read more in the :ref:`User Guide <logistic_regression>`. Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features)
Input data. y : array-like, shape (n_samples,) or (n_samples, n_targets)
Input data, target values. pos_class : int, None
The class with respect to which we perform a one-vs-all fit.
If None, then it is assumed that the given problem is binary. Cs : int | array-like, shape (n_cs,)
List of values for the regularization parameter or integer specifying
the number of regularization parameters that should be used. In this
case, the parameters will be chosen in a logarithmic scale between
1e-4 and 1e4. fit_intercept : bool
Whether to fit an intercept for the model. In this case the shape of
the returned array is (n_cs, n_features + 1). max_iter : int
Maximum number of iterations for the solver. tol : float
Stopping criterion. For the newton-cg and lbfgs solvers, the iteration
will stop when ``max{|g_i | i = 1, ..., n} <= tol``
where ``g_i`` is the i-th component of the gradient. verbose : int
For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity. solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}
Numerical solver to use. coef : array-like, shape (n_features,), default None
Initialization value for coefficients of logistic regression.
Useless for liblinear solver. class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified. dual : bool
Dual or primal formulation. Dual formulation is only implemented for
l2 penalty with liblinear solver. Prefer dual=False when
n_samples > n_features. penalty : str, 'l1' or 'l2'
Used to specify the norm used in the penalization. The 'newton-cg',
'sag' and 'lbfgs' solvers support only l2 penalties. intercept_scaling : float, default 1.
Useful only when the solver 'liblinear' is used
and self.fit_intercept is set to True. In this case, x becomes
[x, self.intercept_scaling],
i.e. a "synthetic" feature with constant value equal to
intercept_scaling is appended to the instance vector.
The intercept becomes ``intercept_scaling * synthetic_feature_weight``. Note! the synthetic feature weight is subject to l1/l2 regularization
as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) intercept_scaling has to be increased. multi_class : str, {'ovr', 'multinomial', 'auto'}, default: 'ovr'
If the option chosen is 'ovr', then a binary problem is fit for each
label. For 'multinomial' the loss minimised is the multinomial loss fit
across the entire probability distribution, *even when the data is
binary*. 'multinomial' is unavailable when solver='liblinear'.
'auto' selects 'ovr' if the data is binary, or if solver='liblinear',
and otherwise selects 'multinomial'. .. versionadded:: 0.18
Stochastic Average Gradient descent solver for 'multinomial' case.
.. versionchanged:: 0.20
Default will change from 'ovr' to 'auto' in 0.22. random_state : int, RandomState instance or None, optional, default None
The seed of the pseudo random number generator to use when shuffling
the data. If int, random_state is the seed used by the random number
generator; If RandomState instance, random_state is the random number
generator; If None, the random number generator is the RandomState
instance used by `np.random`. Used when ``solver`` == 'sag' or
'liblinear'. check_input : bool, default True
If False, the input arrays X and y will not be checked. max_squared_sum : float, default None
Maximum squared sum of X over samples. Used only in SAG solver.
If None, it will be computed, going through all the samples.
The value should be precomputed to speed up cross validation. sample_weight : array-like, shape(n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Returns
-------
coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
List of coefficients for the Logistic Regression model. If
fit_intercept is set to True then the second dimension will be
n_features + 1, where the last item represents the intercept. For
``multiclass='multinomial'``, the shape is (n_classes, n_cs,
n_features) or (n_classes, n_cs, n_features + 1). Cs : ndarray
Grid of Cs used for cross-validation. n_iter : array, shape (n_cs,)
Actual number of iteration for each Cs. Notes
-----
You might get slightly different results with the solver liblinear than
with the others since this uses LIBLINEAR which penalizes the intercept. .. versionchanged:: 0.19
The "copy" parameter was removed.
"""
if isinstance(Cs, numbers.Integral):
Cs = np.logspace(-4, 4, Cs) solver = _check_solver(solver, penalty, dual) # Preprocessing.
if check_input:
X = check_array(X, accept_sparse='csr', dtype=np.float64,
accept_large_sparse=solver != 'liblinear')
y = check_array(y, ensure_2d=False, dtype=None)
check_consistent_length(X, y)
_, n_features = X.shape classes = np.unique(y)
random_state = check_random_state(random_state) multi_class = _check_multi_class(multi_class, solver, len(classes))
if pos_class is None and multi_class != 'multinomial':
if (classes.size > 2):
raise ValueError('To fit OvR, use the pos_class argument')
# np.unique(y) gives labels in sorted order.
pos_class = classes[1] # If sample weights exist, convert them to array (support for lists)
# and check length
# Otherwise set them to 1 for all examples
if sample_weight is not None:
sample_weight = np.array(sample_weight, dtype=X.dtype, order='C')
check_consistent_length(y, sample_weight)
else:
sample_weight = np.ones(X.shape[0], dtype=X.dtype) # If class_weights is a dict (provided by the user), the weights
# are assigned to the original labels. If it is "balanced", then
# the class_weights are assigned after masking the labels with a OvR.
le = LabelEncoder()
if isinstance(class_weight, dict) or multi_class == 'multinomial':
class_weight_ = compute_class_weight(class_weight, classes, y)
sample_weight *= class_weight_[le.fit_transform(y)] # For doing a ovr, we need to mask the labels first. for the
# multinomial case this is not necessary.
if multi_class == 'ovr':
w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
mask_classes = np.array([-1, 1])
mask = (y == pos_class)
y_bin = np.ones(y.shape, dtype=X.dtype)
y_bin[~mask] = -1.
# for compute_class_weight if class_weight == "balanced":
class_weight_ = compute_class_weight(class_weight, mask_classes,
y_bin)
sample_weight *= class_weight_[le.fit_transform(y_bin)] else:
if solver not in ['sag', 'saga']:
lbin = LabelBinarizer()
Y_multi = lbin.fit_transform(y)
if Y_multi.shape[1] == 1:
Y_multi = np.hstack([1 - Y_multi, Y_multi])
else:
# SAG multinomial solver needs LabelEncoder, not LabelBinarizer
le = LabelEncoder()
Y_multi = le.fit_transform(y).astype(X.dtype, copy=False) w0 = np.zeros((classes.size, n_features + int(fit_intercept)),
order='F', dtype=X.dtype) if coef is not None:
# it must work both giving the bias term and not
if multi_class == 'ovr':
if coef.size not in (n_features, w0.size):
raise ValueError(
'Initialization coef is of shape %d, expected shape '
'%d or %d' % (coef.size, n_features, w0.size))
w0[:coef.size] = coef
else:
# For binary problems coef.shape[0] should be 1, otherwise it
# should be classes.size.
n_classes = classes.size
if n_classes == 2:
n_classes = 1 if (coef.shape[0] != n_classes or
coef.shape[1] not in (n_features, n_features + 1)):
raise ValueError(
'Initialization coef is of shape (%d, %d), expected '
'shape (%d, %d) or (%d, %d)' % (
coef.shape[0], coef.shape[1], classes.size,
n_features, classes.size, n_features + 1)) if n_classes == 1:
w0[0, :coef.shape[1]] = -coef
w0[1, :coef.shape[1]] = coef
else:
w0[:, :coef.shape[1]] = coef if multi_class == 'multinomial':
# fmin_l_bfgs_b and newton-cg accepts only ravelled parameters.
if solver in ['lbfgs', 'newton-cg']:
w0 = w0.ravel()
target = Y_multi
if solver == 'lbfgs':
func = lambda x, *args: _multinomial_loss_grad(x, *args)[0:2]
elif solver == 'newton-cg':
func = lambda x, *args: _multinomial_loss(x, *args)[0]
grad = lambda x, *args: _multinomial_loss_grad(x, *args)[1]
hess = _multinomial_grad_hess
warm_start_sag = {'coef': w0.T}
else:
target = y_bin
if solver == 'lbfgs':
func = _logistic_loss_and_grad
elif solver == 'newton-cg':
func = _logistic_loss
grad = lambda x, *args: _logistic_loss_and_grad(x, *args)[1]
hess = _logistic_grad_hess
warm_start_sag = {'coef': np.expand_dims(w0, axis=1)} coefs = list()
n_iter = np.zeros(len(Cs), dtype=np.int32)
for i, C in enumerate(Cs):
if solver == 'lbfgs':
iprint = [-1, 50, 1, 100, 101][
np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
w0, loss, info = optimize.fmin_l_bfgs_b(
func, w0, fprime=None,
args=(X, target, 1. / C, sample_weight),
iprint=iprint, pgtol=tol, maxiter=max_iter)
if info["warnflag"] == 1:
warnings.warn("lbfgs failed to converge. Increase the number "
"of iterations.", ConvergenceWarning)
# In scipy <= 1.0.0, nit may exceed maxiter.
# See https://github.com/scipy/scipy/issues/7854.
n_iter_i = min(info['nit'], max_iter)
elif solver == 'newton-cg':
args = (X, target, 1. / C, sample_weight)
w0, n_iter_i = newton_cg(hess, func, grad, w0, args=args,
maxiter=max_iter, tol=tol)
elif solver == 'liblinear':
coef_, intercept_, n_iter_i, = _fit_liblinear(
X, target, C, fit_intercept, intercept_scaling, None,
penalty, dual, verbose, max_iter, tol, random_state,
sample_weight=sample_weight)
if fit_intercept:
w0 = np.concatenate([coef_.ravel(), intercept_])
else:
w0 = coef_.ravel() elif solver in ['sag', 'saga']:
if multi_class == 'multinomial':
target = target.astype(np.float64)
loss = 'multinomial'
else:
loss = 'log'
if penalty == 'l1':
alpha = 0.
beta = 1. / C
else:
alpha = 1. / C
beta = 0.
w0, n_iter_i, warm_start_sag = sag_solver(
X, target, sample_weight, loss, alpha,
beta, max_iter, tol,
verbose, random_state, False, max_squared_sum, warm_start_sag,
is_saga=(solver == 'saga')) else:
raise ValueError("solver must be one of {'liblinear', 'lbfgs', "
"'newton-cg', 'sag'}, got '%s' instead" % solver) if multi_class == 'multinomial':
n_classes = max(2, classes.size)
multi_w0 = np.reshape(w0, (n_classes, -1))
if n_classes == 2:
multi_w0 = multi_w0[1][np.newaxis, :]
coefs.append(multi_w0.copy())
else:
coefs.append(w0.copy()) n_iter[i] = n_iter_i return np.array(coefs), np.array(Cs), n_iter# helper function for LogisticCV
def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
scoring=None, fit_intercept=False,
max_iter=100, tol=1e-4, class_weight=None,
verbose=0, solver='lbfgs', penalty='l2',
dual=False, intercept_scaling=1.,
multi_class='warn', random_state=None,
max_squared_sum=None, sample_weight=None):
"""Computes scores across logistic_regression_path Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data. y : array-like, shape (n_samples,) or (n_samples, n_targets)
Target labels. train : list of indices
The indices of the train set. test : list of indices
The indices of the test set. pos_class : int, None
The class with respect to which we perform a one-vs-all fit.
If None, then it is assumed that the given problem is binary. Cs : list of floats | int
Each of the values in Cs describes the inverse of
regularization strength. If Cs is as an int, then a grid of Cs
values are chosen in a logarithmic scale between 1e-4 and 1e4.
If not provided, then a fixed set of values for Cs are used. scoring : callable or None, optional, default: None
A string (see model evaluation documentation) or
a scorer callable object / function with signature
``scorer(estimator, X, y)``. For a list of scoring functions
that can be used, look at :mod:`sklearn.metrics`. The
default scoring option used is accuracy_score. fit_intercept : bool
If False, then the bias term is set to zero. Else the last
term of each coef_ gives us the intercept. max_iter : int
Maximum number of iterations for the solver. tol : float
Tolerance for stopping criteria. class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))`` Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified. verbose : int
For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity. solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}
Decides which solver to use. penalty : str, 'l1' or 'l2'
Used to specify the norm used in the penalization. The 'newton-cg',
'sag' and 'lbfgs' solvers support only l2 penalties. dual : bool
Dual or primal formulation. Dual formulation is only implemented for
l2 penalty with liblinear solver. Prefer dual=False when
n_samples > n_features. intercept_scaling : float, default 1.
Useful only when the solver 'liblinear' is used
and self.fit_intercept is set to True. In this case, x becomes
[x, self.intercept_scaling],
i.e. a "synthetic" feature with constant value equals to
intercept_scaling is appended to the instance vector.
The intercept becomes intercept_scaling * synthetic feature weight
Note! the synthetic feature weight is subject to l1/l2 regularization
as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) intercept_scaling has to be increased. multi_class : str, {'ovr', 'multinomial'}
If the option chosen is 'ovr', then a binary problem is fit for each
label. For 'multinomial' the loss minimised is the multinomial loss fit
across the entire probability distribution, *even when the data is
binary*. 'multinomial' is unavailable when solver='liblinear'. random_state : int, RandomState instance or None, optional, default None
The seed of the pseudo random number generator to use when shuffling
the data. If int, random_state is the seed used by the random number
generator; If RandomState instance, random_state is the random number
generator; If None, the random number generator is the RandomState
instance used by `np.random`. Used when ``solver`` == 'sag' and
'liblinear'. max_squared_sum : float, default None
Maximum squared sum of X over samples. Used only in SAG solver.
If None, it will be computed, going through all the samples.
The value should be precomputed to speed up cross validation. sample_weight : array-like, shape(n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Returns
-------
coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
List of coefficients for the Logistic Regression model. If
fit_intercept is set to True then the second dimension will be
n_features + 1, where the last item represents the intercept. Cs : ndarray
Grid of Cs used for cross-validation. scores : ndarray, shape (n_cs,)
Scores obtained for each Cs. n_iter : array, shape(n_cs,)
Actual number of iteration for each Cs.
"""
X_train = X[train]
X_test = X[test]
y_train = y[train]
y_test = y[test] if sample_weight is not None:
sample_weight = check_array(sample_weight, ensure_2d=False)
check_consistent_length(y, sample_weight) sample_weight = sample_weight[train] coefs, Cs, n_iter = logistic_regression_path(
X_train, y_train, Cs=Cs, fit_intercept=fit_intercept,
solver=solver, max_iter=max_iter, class_weight=class_weight,
pos_class=pos_class, multi_class=multi_class,
tol=tol, verbose=verbose, dual=dual, penalty=penalty,
intercept_scaling=intercept_scaling, random_state=random_state,
check_input=False, max_squared_sum=max_squared_sum,
sample_weight=sample_weight) log_reg = LogisticRegression(solver=solver, multi_class=multi_class) # The score method of Logistic Regression has a classes_ attribute.
if multi_class == 'ovr':
log_reg.classes_ = np.array([-1, 1])
elif multi_class == 'multinomial':
log_reg.classes_ = np.unique(y_train)
else:
raise ValueError("multi_class should be either multinomial or ovr, "
"got %d" % multi_class) if pos_class is not None:
mask = (y_test == pos_class)
y_test = np.ones(y_test.shape, dtype=np.float64)
y_test[~mask] = -1. scores = list() if isinstance(scoring, six.string_types):
scoring = get_scorer(scoring)
for w in coefs:
if multi_class == 'ovr':
w = w[np.newaxis, :]
if fit_intercept:
log_reg.coef_ = w[:, :-1]
log_reg.intercept_ = w[:, -1]
else:
log_reg.coef_ = w
log_reg.intercept_ = 0. if scoring is None:
scores.append(log_reg.score(X_test, y_test))
else:
scores.append(scoring(log_reg, X_test, y_test))
return coefs, Cs, np.array(scores), n_iterclass LogisticRegression(BaseEstimator, LinearClassifierMixin,
SparseCoefMixin):
"""Logistic Regression (aka logit, MaxEnt) classifier. In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
scheme if the 'multi_class' option is set to 'ovr', and uses the cross-
entropy loss if the 'multi_class' option is set to 'multinomial'.
(Currently the 'multinomial' option is supported only by the 'lbfgs',
'sag' and 'newton-cg' solvers.) This class implements regularized logistic regression using the
'liblinear' library, 'newton-cg', 'sag' and 'lbfgs' solvers. It can handle
both dense and sparse input. Use C-ordered arrays or CSR matrices
containing 64-bit floats for optimal performance; any other input format
will be converted (and copied). The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization
with primal formulation. The 'liblinear' solver supports both L1 and L2
regularization, with a dual formulation only for the L2 penalty. Read more in the :ref:`User Guide <logistic_regression>`. Parameters
----------
penalty : str, 'l1' or 'l2', default: 'l2'
Used to specify the norm used in the penalization. The 'newton-cg',
'sag' and 'lbfgs' solvers support only l2 penalties. .. versionadded:: 0.19
l1 penalty with SAGA solver (allowing 'multinomial' + L1) dual : bool, default: False
Dual or primal formulation. Dual formulation is only implemented for
l2 penalty with liblinear solver. Prefer dual=False when
n_samples > n_features. tol : float, default: 1e-4
Tolerance for stopping criteria. C : float, default: 1.0
Inverse of regularization strength; must be a positive float.
Like in support vector machines, smaller values specify stronger
regularization. fit_intercept : bool, default: True
Specifies if a constant (a.k.a. bias or intercept) should be
added to the decision function. intercept_scaling : float, default 1.
Useful only when the solver 'liblinear' is used
and self.fit_intercept is set to True. In this case, x becomes
[x, self.intercept_scaling],
i.e. a "synthetic" feature with constant value equal to
intercept_scaling is appended to the instance vector.
The intercept becomes ``intercept_scaling * synthetic_feature_weight``. Note! the synthetic feature weight is subject to l1/l2 regularization
as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) intercept_scaling has to be increased. class_weight : dict or 'balanced', default: None
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified. .. versionadded:: 0.17
*class_weight='balanced'* random_state : int, RandomState instance or None, optional, default: None
The seed of the pseudo random number generator to use when shuffling
the data. If int, random_state is the seed used by the random number
generator; If RandomState instance, random_state is the random number
generator; If None, the random number generator is the RandomState
instance used by `np.random`. Used when ``solver`` == 'sag' or
'liblinear'. solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \
default: 'liblinear'. Algorithm to use in the optimization problem. - For small datasets, 'liblinear' is a good choice, whereas 'sag' and
'saga' are faster for large ones.
- For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs'
handle multinomial loss; 'liblinear' is limited to one-versus-rest
schemes.
- 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty, whereas
'liblinear' and 'saga' handle L1 penalty. Note that 'sag' and 'saga' fast convergence is only guaranteed on
features with approximately the same scale. You can
preprocess the data with a scaler from sklearn.preprocessing. .. versionadded:: 0.17
Stochastic Average Gradient descent solver.
.. versionadded:: 0.19
SAGA solver.
.. versionchanged:: 0.20
Default will change from 'liblinear' to 'lbfgs' in 0.22. max_iter : int, default: 100
Useful only for the newton-cg, sag and lbfgs solvers.
Maximum number of iterations taken for the solvers to converge. multi_class : str, {'ovr', 'multinomial', 'auto'}, default: 'ovr'
If the option chosen is 'ovr', then a binary problem is fit for each
label. For 'multinomial' the loss minimised is the multinomial loss fit
across the entire probability distribution, *even when the data is
binary*. 'multinomial' is unavailable when solver='liblinear'.
'auto' selects 'ovr' if the data is binary, or if solver='liblinear',
and otherwise selects 'multinomial'. .. versionadded:: 0.18
Stochastic Average Gradient descent solver for 'multinomial' case.
.. versionchanged:: 0.20
Default will change from 'ovr' to 'auto' in 0.22. verbose : int, default: 0
For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity. warm_start : bool, default: False
When set to True, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Useless for liblinear solver. See :term:`the Glossary <warm_start>`. .. versionadded:: 0.17
*warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. n_jobs : int or None, optional (default=None)
Number of CPU cores used when parallelizing over classes if
multi_class='ovr'". This parameter is ignored when the ``solver`` is
set to 'liblinear' regardless of whether 'multi_class' is specified or
not. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
context. ``-1`` means using all processors.
See :term:`Glossary <n_jobs>` for more details. Attributes
---------- coef_ : array, shape (1, n_features) or (n_classes, n_features)
Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary.
In particular, when `multi_class='multinomial'`, `coef_` corresponds
to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). intercept_ : array, shape (1,) or (n_classes,)
Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero.
`intercept_` is of shape (1,) when the given problem is binary.
In particular, when `multi_class='multinomial'`, `intercept_`
corresponds to outcome 1 (True) and `-intercept_` corresponds to
outcome 0 (False). n_iter_ : array, shape (n_classes,) or (1, )
Actual number of iterations for all classes. If binary or multinomial,
it returns only 1 element. For liblinear solver, only the maximum
number of iteration across all classes is given. .. versionchanged:: 0.20 In SciPy <= 1.0.0 the number of lbfgs iterations may exceed
``max_iter``. ``n_iter_`` will now report at most ``max_iter``. Examples
--------
>>> from sklearn.datasets import load_iris
>>> from sklearn.linear_model import LogisticRegression
>>> X, y = load_iris(return_X_y=True)
>>> clf = LogisticRegression(random_state=0, solver='lbfgs',
... multi_class='multinomial').fit(X, y)
>>> clf.predict(X[:2, :])
array([0, 0])
>>> clf.predict_proba(X[:2, :]) # doctest: +ELLIPSIS
array([[9.8...e-01, 1.8...e-02, 1.4...e-08],
[9.7...e-01, 2.8...e-02, ...e-08]])
>>> clf.score(X, y)
0.97... See also
--------
SGDClassifier : incrementally trained logistic regression (when given
the parameter ``loss="log"``).
LogisticRegressionCV : Logistic regression with built-in cross validation Notes
-----
The underlying C implementation uses a random number generator to
select features when fitting the model. It is thus not uncommon,
to have slightly different results for the same input data. If
that happens, try with a smaller tol parameter. Predict output may not match that of standalone liblinear in certain
cases. See :ref:`differences from liblinear <liblinear_differences>`
in the narrative documentation. References
---------- LIBLINEAR -- A Library for Large Linear Classification
http://www.csie.ntu.edu.tw/~cjlin/liblinear/ SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach
Minimizing Finite Sums with the Stochastic Average Gradient
https://hal.inria.fr/hal-00860051/document SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014).
SAGA: A Fast Incremental Gradient Method With Support
for Non-Strongly Convex Composite Objectives
https://arxiv.org/abs/1407.0202 Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent
methods for logistic regression and maximum entropy models.
Machine Learning 85(1-2):41-75.
http://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf
""" def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
fit_intercept=True, intercept_scaling=1, class_weight=None,
random_state=None, solver='warn', max_iter=100,
multi_class='warn', verbose=0, warm_start=False, n_jobs=None): self.penalty = penalty
self.dual = dual
self.tol = tol
self.C = C
self.fit_intercept = fit_intercept
self.intercept_scaling = intercept_scaling
self.class_weight = class_weight
self.random_state = random_state
self.solver = solver
self.max_iter = max_iter
self.multi_class = multi_class
self.verbose = verbose
self.warm_start = warm_start
self.n_jobs = n_jobs def fit(self, X, y, sample_weight=None):
"""Fit the model according to the given training data. Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features. y : array-like, shape (n_samples,) or (n_samples, n_targets)
Target vector relative to X. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. .. versionadded:: 0.17
*sample_weight* support to LogisticRegression. Returns
-------
self : object
"""
if not isinstance(self.C, numbers.Number) or self.C < 0:
raise ValueError("Penalty term must be positive; got (C=%r)"
% self.C)
if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
raise ValueError("Maximum number of iteration must be positive;"
" got (max_iter=%r)" % self.max_iter)
if not isinstance(self.tol, numbers.Number) or self.tol < 0:
raise ValueError("Tolerance for stopping criteria must be "
"positive; got (tol=%r)" % self.tol) solver = _check_solver(self.solver, self.penalty, self.dual) if solver in ['newton-cg']:
_dtype = [np.float64, np.float32]
else:
_dtype = np.float64 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
accept_large_sparse=solver != 'liblinear')
check_classification_targets(y)
self.classes_ = np.unique(y)
n_samples, n_features = X.shape multi_class = _check_multi_class(self.multi_class, solver,
len(self.classes_)) if solver == 'liblinear':
if effective_n_jobs(self.n_jobs) != 1:
warnings.warn("'n_jobs' > 1 does not have any effect when"
" 'solver' is set to 'liblinear'. Got 'n_jobs'"
" = {}.".format(effective_n_jobs(self.n_jobs)))
self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
X, y, self.C, self.fit_intercept, self.intercept_scaling,
self.class_weight, self.penalty, self.dual, self.verbose,
self.max_iter, self.tol, self.random_state,
sample_weight=sample_weight)
self.n_iter_ = np.array([n_iter_])
return self if solver in ['sag', 'saga']:
max_squared_sum = row_norms(X, squared=True).max()
else:
max_squared_sum = None n_classes = len(self.classes_)
classes_ = self.classes_
if n_classes < 2:
raise ValueError("This solver needs samples of at least 2 classes"
" in the data, but the data contains only one"
" class: %r" % classes_[0]) if len(self.classes_) == 2:
n_classes = 1
classes_ = classes_[1:] if self.warm_start:
warm_start_coef = getattr(self, 'coef_', None)
else:
warm_start_coef = None
if warm_start_coef is not None and self.fit_intercept:
warm_start_coef = np.append(warm_start_coef,
self.intercept_[:, np.newaxis],
axis=1) self.coef_ = list()
self.intercept_ = np.zeros(n_classes) # Hack so that we iterate only once for the multinomial case.
if multi_class == 'multinomial':
classes_ = [None]
warm_start_coef = [warm_start_coef]
if warm_start_coef is None:
warm_start_coef = [None] * n_classes path_func = delayed(logistic_regression_path) # The SAG solver releases the GIL so it's more efficient to use
# threads for this solver.
if solver in ['sag', 'saga']:
prefer = 'threads'
else:
prefer = 'processes'
fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
prefer=prefer)(
path_func(X, y, pos_class=class_, Cs=[self.C],
fit_intercept=self.fit_intercept, tol=self.tol,
verbose=self.verbose, solver=solver,
multi_class=multi_class, max_iter=self.max_iter,
class_weight=self.class_weight, check_input=False,
random_state=self.random_state, coef=warm_start_coef_,
penalty=self.penalty,
max_squared_sum=max_squared_sum,
sample_weight=sample_weight)
for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] if multi_class == 'multinomial':
self.coef_ = fold_coefs_[0][0]
else:
self.coef_ = np.asarray(fold_coefs_)
self.coef_ = self.coef_.reshape(n_classes, n_features +
int(self.fit_intercept)) if self.fit_intercept:
self.intercept_ = self.coef_[:, -1]
self.coef_ = self.coef_[:, :-1] return self def predict_proba(self, X):
"""Probability estimates. The returned estimates for all classes are ordered by the
label of classes. For a multi_class problem, if multi_class is set to be "multinomial"
the softmax function is used to find the predicted probability of
each class.
Else use a one-vs-rest approach, i.e calculate the probability
of each class assuming it to be positive using the logistic function.
and normalize these values across all the classes. Parameters
----------
X : array-like, shape = [n_samples, n_features] Returns
-------
T : array-like, shape = [n_samples, n_classes]
Returns the probability of the sample for each class in the model,
where classes are ordered as they are in ``self.classes_``.
"""
if not hasattr(self, "coef_"):
raise NotFittedError("Call fit before prediction") ovr = (self.multi_class in ["ovr", "warn"] or
(self.multi_class == 'auto' and (self.classes_.size <= 2 or
self.solver == 'liblinear')))
if ovr:
return super(LogisticRegression, self)._predict_proba_lr(X)
else:
decision = self.decision_function(X)
if decision.ndim == 1:
# Workaround for multi_class="multinomial" and binary outcomes
# which requires softmax prediction with only a 1D decision.
decision_2d = np.c_[-decision, decision]
else:
decision_2d = decision
return softmax(decision_2d, copy=False) def predict_log_proba(self, X):
"""Log of probability estimates. The returned estimates for all classes are ordered by the
label of classes. Parameters
----------
X : array-like, shape = [n_samples, n_features] Returns
-------
T : array-like, shape = [n_samples, n_classes]
Returns the log-probability of the sample for each class in the
model, where classes are ordered as they are in ``self.classes_``.
"""
return np.log(self.predict_proba(X))class LogisticRegressionCV(LogisticRegression, BaseEstimator,
LinearClassifierMixin):
"""Logistic Regression CV (aka logit, MaxEnt) classifier. This class implements logistic regression using liblinear, newton-cg, sag
of lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2
regularization with primal formulation. The liblinear solver supports both
L1 and L2 regularization, with a dual formulation only for the L2 penalty. For the grid of Cs values (that are set by default to be ten values in
a logarithmic scale between 1e-4 and 1e4), the best hyperparameter is
selected by the cross-validator StratifiedKFold, but it can be changed
using the cv parameter. In the case of newton-cg and lbfgs solvers,
we warm start along the path i.e guess the initial coefficients of the
present fit to be the coefficients got after convergence in the previous
fit, so it is supposed to be faster for high-dimensional dense data. For a multiclass problem, the hyperparameters for each class are computed
using the best scores got by doing a one-vs-rest in parallel across all
folds and classes. Hence this is not the true multinomial loss. Read more in the :ref:`User Guide <logistic_regression>`. Parameters
----------
Cs : list of floats | int
Each of the values in Cs describes the inverse of regularization
strength. If Cs is as an int, then a grid of Cs values are chosen
in a logarithmic scale between 1e-4 and 1e4.
Like in support vector machines, smaller values specify stronger
regularization. fit_intercept : bool, default: True
Specifies if a constant (a.k.a. bias or intercept) should be
added to the decision function. cv : integer or cross-validation generator, default: None
The default cross-validation generator used is Stratified K-Folds.
If an integer is provided, then it is the number of folds used.
See the module :mod:`sklearn.model_selection` module for the
list of possible cross-validation objects. .. versionchanged:: 0.20
``cv`` default value if None will change from 3-fold to 5-fold
in v0.22. dual : bool
Dual or primal formulation. Dual formulation is only implemented for
l2 penalty with liblinear solver. Prefer dual=False when
n_samples > n_features. penalty : str, 'l1' or 'l2'
Used to specify the norm used in the penalization. The 'newton-cg',
'sag' and 'lbfgs' solvers support only l2 penalties. scoring : string, callable, or None
A string (see model evaluation documentation) or
a scorer callable object / function with signature
``scorer(estimator, X, y)``. For a list of scoring functions
that can be used, look at :mod:`sklearn.metrics`. The
default scoring option used is 'accuracy'. solver : str, {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \
default: 'lbfgs'. Algorithm to use in the optimization problem. - For small datasets, 'liblinear' is a good choice, whereas 'sag' and
'saga' are faster for large ones.
- For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs'
handle multinomial loss; 'liblinear' is limited to one-versus-rest
schemes.
- 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty, whereas
'liblinear' and 'saga' handle L1 penalty.
- 'liblinear' might be slower in LogisticRegressionCV because it does
not handle warm-starting. Note that 'sag' and 'saga' fast convergence is only guaranteed on
features with approximately the same scale. You can preprocess the data
with a scaler from sklearn.preprocessing. .. versionadded:: 0.17
Stochastic Average Gradient descent solver.
.. versionadded:: 0.19
SAGA solver. tol : float, optional
Tolerance for stopping criteria. max_iter : int, optional
Maximum number of iterations of the optimization algorithm. class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified. .. versionadded:: 0.17
class_weight == 'balanced' n_jobs : int or None, optional (default=None)
Number of CPU cores used during the cross-validation loop.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details. verbose : int
For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any
positive number for verbosity. refit : bool
If set to True, the scores are averaged across all folds, and the
coefs and the C that corresponds to the best score is taken, and a
final refit is done using these parameters.
Otherwise the coefs, intercepts and C that correspond to the
best scores across folds are averaged. intercept_scaling : float, default 1.
Useful only when the solver 'liblinear' is used
and self.fit_intercept is set to True. In this case, x becomes
[x, self.intercept_scaling],
i.e. a "synthetic" feature with constant value equal to
intercept_scaling is appended to the instance vector.
The intercept becomes ``intercept_scaling * synthetic_feature_weight``. Note! the synthetic feature weight is subject to l1/l2 regularization
as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) intercept_scaling has to be increased. multi_class : str, {'ovr', 'multinomial', 'auto'}, default: 'ovr'
If the option chosen is 'ovr', then a binary problem is fit for each
label. For 'multinomial' the loss minimised is the multinomial loss fit
across the entire probability distribution, *even when the data is
binary*. 'multinomial' is unavailable when solver='liblinear'.
'auto' selects 'ovr' if the data is binary, or if solver='liblinear',
and otherwise selects 'multinomial'. .. versionadded:: 0.18
Stochastic Average Gradient descent solver for 'multinomial' case.
.. versionchanged:: 0.20
Default will change from 'ovr' to 'auto' in 0.22. random_state : int, RandomState instance or None, optional, default None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`. Attributes
----------
coef_ : array, shape (1, n_features) or (n_classes, n_features)
Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem
is binary. intercept_ : array, shape (1,) or (n_classes,)
Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero.
`intercept_` is of shape(1,) when the problem is binary. Cs_ : array
Array of C i.e. inverse of regularization parameter values used
for cross-validation. coefs_paths_ : array, shape ``(n_folds, len(Cs_), n_features)`` or \
``(n_folds, len(Cs_), n_features + 1)``
dict with classes as the keys, and the path of coefficients obtained
during cross-validating across each fold and then across each Cs
after doing an OvR for the corresponding class as values.
If the 'multi_class' option is set to 'multinomial', then
the coefs_paths are the coefficients corresponding to each class.
Each dict value has shape ``(n_folds, len(Cs_), n_features)`` or
``(n_folds, len(Cs_), n_features + 1)`` depending on whether the
intercept is fit or not. scores_ : dict
dict with classes as the keys, and the values as the
grid of scores obtained during cross-validating each fold, after doing
an OvR for the corresponding class. If the 'multi_class' option
given is 'multinomial' then the same scores are repeated across
all classes, since this is the multinomial class.
Each dict value has shape (n_folds, len(Cs)) C_ : array, shape (n_classes,) or (n_classes - 1,)
Array of C that maps to the best scores across every class. If refit is
set to False, then for each class, the best C is the average of the
C's that correspond to the best scores for each fold.
`C_` is of shape(n_classes,) when the problem is binary. n_iter_ : array, shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs)
Actual number of iterations for all classes, folds and Cs.
In the binary or multinomial cases, the first dimension is equal to 1. Examples
--------
>>> from sklearn.datasets import load_iris
>>> from sklearn.linear_model import LogisticRegressionCV
>>> X, y = load_iris(return_X_y=True)
>>> clf = LogisticRegressionCV(cv=5, random_state=0,
... multi_class='multinomial').fit(X, y)
>>> clf.predict(X[:2, :])
array([0, 0])
>>> clf.predict_proba(X[:2, :]).shape
(2, 3)
>>> clf.score(X, y) # doctest: +ELLIPSIS
0.98... See also
--------
LogisticRegression """
    def __init__(self, Cs=10, fit_intercept=True, cv='warn', dual=False,
                 penalty='l2', scoring=None, solver='lbfgs', tol=1e-4,
                 max_iter=100, class_weight=None, n_jobs=None, verbose=0,
                 refit=True, intercept_scaling=1., multi_class='warn',
                 random_state=None):
        self.Cs = Cs
        self.fit_intercept = fit_intercept
        self.cv = cv
        self.dual = dual
        self.penalty = penalty
        self.scoring = scoring
        self.tol = tol
        self.max_iter = max_iter
        self.class_weight = class_weight
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.solver = solver
        self.refit = refit
        self.intercept_scaling = intercept_scaling
        self.multi_class = multi_class
        self.random_state = random_state

    def fit(self, X, y, sample_weight=None):
"""Fit the model according to the given training data. Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features. y : array-like, shape (n_samples,) or (n_samples, n_targets)
Target vector relative to X. sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Returns
-------
self : object
"""
solver = _check_solver(self.solver, self.penalty, self.dual) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
raise ValueError("Maximum number of iteration must be positive;"
" got (max_iter=%r)" % self.max_iter)
if not isinstance(self.tol, numbers.Number) or self.tol < 0:
raise ValueError("Tolerance for stopping criteria must be "
"positive; got (tol=%r)" % self.tol) X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
order="C",
accept_large_sparse=solver != 'liblinear')
check_classification_targets(y) class_weight = self.class_weight # Encode for string labels
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)
if isinstance(class_weight, dict):
class_weight = dict((label_encoder.transform([cls])[0], v)
for cls, v in class_weight.items()) # The original class labels
classes = self.classes_ = label_encoder.classes_
encoded_labels = label_encoder.transform(label_encoder.classes_) multi_class = _check_multi_class(self.multi_class, solver,
len(classes)) if solver in ['sag', 'saga']:
max_squared_sum = row_norms(X, squared=True).max()
else:
max_squared_sum = None # init cross-validation generator
cv = check_cv(self.cv, y, classifier=True)
folds = list(cv.split(X, y)) # Use the label encoded classes
n_classes = len(encoded_labels) if n_classes < 2:
raise ValueError("This solver needs samples of at least 2 classes"
" in the data, but the data contains only one"
" class: %r" % classes[0]) if n_classes == 2:
# OvR in case of binary problems is as good as fitting
# the higher label
n_classes = 1
encoded_labels = encoded_labels[1:]
classes = classes[1:] # We need this hack to iterate only once over labels, in the case of
# multi_class = multinomial, without changing the value of the labels.
if multi_class == 'multinomial':
iter_encoded_labels = iter_classes = [None]
else:
iter_encoded_labels = encoded_labels
iter_classes = classes # compute the class weights for the entire dataset y
if class_weight == "balanced":
class_weight = compute_class_weight(class_weight,
np.arange(len(self.classes_)),
y)
class_weight = dict(enumerate(class_weight)) path_func = delayed(_log_reg_scoring_path) # The SAG solver releases the GIL so it's more efficient to use
# threads for this solver.
if self.solver in ['sag', 'saga']:
prefer = 'threads'
else:
prefer = 'processes'
fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
prefer=prefer)(
path_func(X, y, train, test, pos_class=label, Cs=self.Cs,
fit_intercept=self.fit_intercept, penalty=self.penalty,
dual=self.dual, solver=solver, tol=self.tol,
max_iter=self.max_iter, verbose=self.verbose,
class_weight=class_weight, scoring=self.scoring,
multi_class=multi_class,
intercept_scaling=self.intercept_scaling,
random_state=self.random_state,
max_squared_sum=max_squared_sum,
sample_weight=sample_weight
)
for label in iter_encoded_labels
for train, test in folds) if multi_class == 'multinomial':
multi_coefs_paths, Cs, multi_scores, n_iter_ = zip(*fold_coefs_)
multi_coefs_paths = np.asarray(multi_coefs_paths)
multi_scores = np.asarray(multi_scores) # This is just to maintain API similarity between the ovr and
# multinomial option.
# Coefs_paths in now n_folds X len(Cs) X n_classes X n_features
# we need it to be n_classes X len(Cs) X n_folds X n_features
# to be similar to "ovr".
coefs_paths = np.rollaxis(multi_coefs_paths, 2, 0) # Multinomial has a true score across all labels. Hence the
# shape is n_folds X len(Cs). We need to repeat this score
# across all labels for API similarity.
scores = np.tile(multi_scores, (n_classes, 1, 1))
self.Cs_ = Cs[0]
self.n_iter_ = np.reshape(n_iter_, (1, len(folds),
len(self.Cs_))) else:
coefs_paths, Cs, scores, n_iter_ = zip(*fold_coefs_)
self.Cs_ = Cs[0]
coefs_paths = np.reshape(coefs_paths, (n_classes, len(folds),
len(self.Cs_), -1))
self.n_iter_ = np.reshape(n_iter_, (n_classes, len(folds),
len(self.Cs_))) self.coefs_paths_ = dict(zip(classes, coefs_paths))
scores = np.reshape(scores, (n_classes, len(folds), -1))
self.scores_ = dict(zip(classes, scores)) self.C_ = list()
self.coef_ = np.empty((n_classes, X.shape[1]))
self.intercept_ = np.zeros(n_classes) # hack to iterate only once for multinomial case.
if multi_class == 'multinomial':
scores = multi_scores
coefs_paths = multi_coefs_paths for index, (cls, encoded_label) in enumerate(
zip(iter_classes, iter_encoded_labels)): if multi_class == 'ovr':
# The scores_ / coefs_paths_ dict have unencoded class
# labels as their keys
scores = self.scores_[cls]
coefs_paths = self.coefs_paths_[cls] if self.refit:
best_index = scores.sum(axis=0).argmax() C_ = self.Cs_[best_index]
self.C_.append(C_)
if multi_class == 'multinomial':
coef_init = np.mean(coefs_paths[:, best_index, :, :],
axis=0)
else:
coef_init = np.mean(coefs_paths[:, best_index, :], axis=0) # Note that y is label encoded and hence pos_class must be
# the encoded label / None (for 'multinomial')
w, _, _ = logistic_regression_path(
X, y, pos_class=encoded_label, Cs=[C_], solver=solver,
fit_intercept=self.fit_intercept, coef=coef_init,
max_iter=self.max_iter, tol=self.tol,
penalty=self.penalty,
class_weight=class_weight,
multi_class=multi_class,
verbose=max(0, self.verbose - 1),
random_state=self.random_state,
check_input=False, max_squared_sum=max_squared_sum,
sample_weight=sample_weight)
w = w[0] else:
# Take the best scores across every fold and the average of all
# coefficients corresponding to the best scores.
best_indices = np.argmax(scores, axis=1)
w = np.mean([coefs_paths[i][best_indices[i]]
for i in range(len(folds))], axis=0)
self.C_.append(np.mean(self.Cs_[best_indices])) if multi_class == 'multinomial':
self.C_ = np.tile(self.C_, n_classes)
self.coef_ = w[:, :X.shape[1]]
if self.fit_intercept:
self.intercept_ = w[:, -1]
else:
self.coef_[index] = w[: X.shape[1]]
if self.fit_intercept:
self.intercept_[index] = w[-1] self.C_ = np.asarray(self.C_)
return self def score(self, X, y, sample_weight=None):
"""Returns the score using the `scoring` option on the given
test data and labels. Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples. y : array-like, shape = (n_samples,)
True labels for X. sample_weight : array-like, shape = [n_samples], optional
Sample weights. Returns
-------
score : float
            Score of self.predict(X) wrt. y.

        """
        if self.scoring is not None:
            warnings.warn("The long-standing behavior to use the "
                          "accuracy score has changed. The scoring "
                          "parameter is now used. "
                          "This warning will disappear in version 0.22.",
                          ChangedBehaviorWarning)
        scoring = self.scoring or 'accuracy'
        if isinstance(scoring, six.string_types):
            scoring = get_scorer(scoring)

        return scoring(self, X, y, sample_weight=sample_weight)
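
The source above documents the tuning parameters; the short sketch below is not part of the library code, it only shows how those knobs are typically exercised in practice. Everything in it is an illustrative assumption: the breast-cancer dataset, the train/test split and every parameter value were chosen for demonstration, not as recommendations.

# -*- coding: utf-8 -*-
# Minimal, illustrative tuning sketch (assumed example data and settings).
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Example data: the built-in breast-cancer set (binary labels).
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# 'lbfgs' / 'sag' / 'saga' converge best when features share a similar scale.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Key knobs from the parameter documentation above:
# penalty, C, solver, class_weight, max_iter.
clf = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs',
                         class_weight='balanced', max_iter=1000,
                         random_state=0)
clf.fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))
print("class probabilities, first sample:", clf.predict_proba(X_test[:1]))

# LogisticRegressionCV searches a grid of C values (np.logspace(-4, 4, Cs)
# when Cs is an int) by cross-validation and refits on the best one.
cv_clf = LogisticRegressionCV(Cs=10, cv=5, solver='lbfgs', max_iter=1000,
                              random_state=0).fit(X_train, y_train)
print("best C:", cv_clf.C_)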

  

https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149 (Follow the author's homepage for Python video tutorials and a large collection of free classic Python articles.)

