"""
This module contains loss classes suitable for fitting.

It is not part of the public API.
Specific losses are used for regression, binary classification or multiclass
classification.
    Nxlogy   )check_scalar)_weighted_percentile   )CyAbsoluteErrorCyExponentialLossCyHalfBinomialLossCyHalfGammaLossCyHalfMultinomialLossCyHalfPoissonLossCyHalfSquaredErrorCyHalfTweedieLossCyHalfTweedieLossIdentityCyHuberLossCyPinballLoss)HalfLogitLinkIdentityLinkInterval	LogitLinkLogLinkMultinomialLogitc                   @   s   e Zd ZdZdZdZdZdddZdd Zd	d
 Z	dddZ
dddZd ddZd!ddZd"ddZd#ddZd$ddZejdfddZdS )%BaseLossa  Base class for a loss function of 1-dimensional targets.

    Conventions:

        - y_true.shape = sample_weight.shape = (n_samples,)
        - y_pred.shape = raw_prediction.shape = (n_samples,)
        - If is_multiclass is true (multiclass classification), then
          y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
          Note that this corresponds to the return value of decision_function.

    y_true, y_pred, sample_weight and raw_prediction must either be all float64
    or all float32.
    gradient and hessian must be either both float64 or both float32.

    Note that y_pred = link.inverse(raw_prediction).

    Specific loss classes can inherit specific link classes to satisfy
    BaseLink's abstractmethods.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.
    n_classes : {None, int}
        The number of classes for classification, else None.

    Attributes
    ----------
    closs: CyLossFunction
    link : BaseLink
    interval_y_true : Interval
        Valid interval for y_true
    interval_y_pred : Interval
        Valid interval for y_pred
    differentiable : bool
        Indicates whether or not loss function is differentiable in
        raw_prediction everywhere.
    need_update_leaves_values : bool
        Indicates whether decision trees in gradient boosting need to update
        leaf values after having been fit to the (negative) gradients.
    approx_hessian : bool
        Indicates whether the hessian is approximated or exact. If
        approximated, it should be larger than or equal to the exact one.
    constant_hessian : bool
        Indicates whether the hessian is one for this loss.
    is_multiclass : bool
        Indicates whether n_classes > 2 is allowed.
    """

    # These class attributes are overridden by subclasses where needed.
    need_update_leaves_values = False
    differentiable = True
    is_multiclass = False

    def __init__(self, closs, link, n_classes=None):
        self.closs = closs
        self.link = link
        self.approx_hessian = False
        self.constant_hessian = False
        self.n_classes = n_classes
        self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        self.interval_y_pred = self.link.interval_y_pred

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_true.includes(y)

    def in_y_pred_range(self, y):
        """Return True if y is in the valid range of y_pred.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_pred.includes(y)

    def loss(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        n_threads=1,
    ):
        """Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
        """
        if loss_out is None:
            loss_out = np.empty_like(y_true)
        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)

        return self.closs.loss(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            n_threads=n_threads,
        )

    def loss_gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the loss is stored. If None, a new array
            might be created.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if loss_out is None:
            if gradient_out is None:
                loss_out = np.empty_like(y_true)
                gradient_out = np.empty_like(raw_prediction)
            else:
                loss_out = np.empty_like(y_true, dtype=gradient_out.dtype)
        elif gradient_out is None:
            gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype)

        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)

        return self.closs.loss_gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )

    def gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if gradient_out is None:
            gradient_out = np.empty_like(raw_prediction)

        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)

        return self.closs.gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )

    def gradient_hessian(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        hessian_out=None,
        n_threads=1,
    ):
        """Compute gradient and hessian of loss w.r.t raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or C-contiguous array of shape (n_samples,) or array             of shape (n_samples, n_classes)
            A location into which the hessian is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.

        hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise hessians.
        """
        if gradient_out is None:
            if hessian_out is None:
                gradient_out = np.empty_like(raw_prediction)
                hessian_out = np.empty_like(raw_prediction)
            else:
                gradient_out = np.empty_like(hessian_out)
        elif hessian_out is None:
            hessian_out = np.empty_like(gradient_out)

        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)
        if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
            hessian_out = hessian_out.squeeze(1)

        return self.closs.gradient_hessian(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            hessian_out=hessian_out,
            n_threads=n_threads,
        )

    def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
        """Compute the weighted average loss.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        """
        return np.average(
            self.loss(
                y_true=y_true,
                raw_prediction=raw_prediction,
                sample_weight=None,
                loss_out=None,
                n_threads=n_threads,
            ),
            weights=sample_weight,
        )

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This can be used as initial estimates of predictions, i.e. before the
        first iteration in fit.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or array of shape (n_samples,)
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        """
        # Take the weighted average of the target, clip it into the valid
        # range of y_pred and transform it into link (raw_prediction) space.
        y_pred = np.average(y_true, weights=sample_weight, axis=0)
        eps = 10 * np.finfo(y_pred.dtype).eps

        if self.interval_y_pred.low == -np.inf:
            a_min = None
        elif self.interval_y_pred.low_inclusive:
            a_min = self.interval_y_pred.low
        else:
            a_min = self.interval_y_pred.low + eps

        if self.interval_y_pred.high == np.inf:
            a_max = None
        elif self.interval_y_pred.high_inclusive:
            a_max = self.interval_y_pred.high
        else:
            a_max = self.interval_y_pred.high - eps

        if a_min is None and a_max is None:
            return self.link.link(y_pred)
        else:
            return self.link.link(np.clip(y_pred, a_min, a_max))

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Calculate term dropped in loss.

        With this term added, the loss of perfect predictions is zero.
        """
        return np.zeros_like(y_true)

    def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"):
        """Initialize arrays for gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined values.

        Parameters
        ----------
        n_samples : int
            The number of samples, usually passed to `fit()`.
        dtype : {np.float64, np.float32}, default=np.float64
            The dtype of the arrays gradient and hessian.
        order : {'C', 'F'}, default='F'
            Order of the arrays gradient and hessian. The default 'F' makes the arrays
            contiguous along samples.

        Returns
        -------
        gradient : C-contiguous array of shape (n_samples,) or array of shape             (n_samples, n_classes)
            Empty array (allocated but not initialized) to be used as argument
            gradient_out.
        hessian : C-contiguous array of shape (n_samples,), array of shape
            (n_samples, n_classes) or shape (1,)
            Empty (allocated but not initialized) array to be used as argument
            hessian_out.
            If constant_hessian is True (e.g. `HalfSquaredError`), the array is
            initialized to ``1``.
        """
        if dtype not in (np.float32, np.float64):
            raise ValueError(
                "Valid options for 'dtype' are np.float32 and np.float64. "
                f"Got dtype={dtype} instead."
            )

        if self.is_multiclass:
            shape = (n_samples, self.n_classes)
        else:
            shape = (n_samples,)

        gradient = np.empty(shape=shape, dtype=dtype, order=order)
        if self.constant_hessian:
            # A constant hessian does not need a full array; a single element
            # initialized to 1 is enough.
            hessian = np.ones(shape=(1,), dtype=dtype)
        else:
            hessian = np.empty(shape=shape, dtype=dtype, order=order)

        return gradient, hessian


class HalfSquaredError(BaseLoss):
    """Half squared error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half squared error is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2

    The factor of 0.5 simplifies the computation of gradients and results in a
    unit hessian (and is consistent with what is done in LightGBM). It is also
    half the Normal distribution deviance.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfSquaredError(), link=IdentityLink())
        self.constant_hessian = sample_weight is None
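
# A minimal usage sketch (not part of the module itself), assuming the compiled
# Cython helpers in ``sklearn._loss._loss`` are available::
#
#     import numpy as np
#     loss = HalfSquaredError()
#     y_true = np.array([1.0, 0.0, 2.0])
#     raw_prediction = np.array([0.5, 0.0, 1.0])
#     loss(y_true, raw_prediction)       # mean of 0.5 * (y_true - raw_prediction)**2
#     loss.fit_intercept_only(y_true)    # identity link, so simply the mean of y_true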
ddZ  ZS )AbsoluteErrora  Absolute error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the absolute error is defined as::

        loss(x_i) = |y_true_i - raw_prediction_i|
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyAbsoluteError(), link=IdentityLink())
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        if sample_weight is None:
            return np.median(y_true, axis=0)
        else:
            return _weighted_percentile(y_true, sample_weight, 50)


class PinballLoss(BaseLoss):
    """Quantile loss aka pinball loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the pinball loss is defined as::

        loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)

        rho_{quantile}(u) = u * (quantile - 1_{u<0})
                          = -u *(1 - quantile)  if u < 0
                             u * quantile       if u >= 0

    Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level of the quantile to be estimated. Must be in range (0, 1).
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None, quantile=0.5):
        check_scalar(
            quantile,
            "quantile",
            target_type=numbers.Real,
            min_val=0,
            max_val=1,
            include_boundaries="neither",
        )
        super().__init__(
            closs=CyPinballLoss(quantile=float(quantile)),
            link=IdentityLink(),
        )
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        if sample_weight is None:
            return np.percentile(y_true, 100 * self.closs.quantile, axis=0)
        else:
            return _weighted_percentile(
                y_true, sample_weight, 100 * self.closs.quantile
            )


class HuberLoss(BaseLoss):
    """Huber loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the Huber loss is defined as::

        loss(x_i) = 1/2 * abserr**2            if abserr <= delta
                    delta * (abserr - delta/2) if abserr > delta

        abserr = |y_true_i - raw_prediction_i|
        delta = quantile(abserr, self.quantile)

    Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
    equals delta * (AbsoluteError() - delta/2).

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level which defines the breaking point `delta` to distinguish
        between absolute error and squared error. Must be in range (0, 1).

    Reference
    ---------
    .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
      boosting machine <10.1214/aos/1013203451>`.
      Annals of Statistics, 29, 1189-1232.
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None, quantile=0.9, delta=0.5):
        check_scalar(
            quantile,
            "quantile",
            target_type=numbers.Real,
            min_val=0,
            max_val=1,
            include_boundaries="neither",
        )
        self.quantile = quantile  # quantile level that defines the breaking point delta
        super().__init__(
            closs=CyHuberLoss(delta=float(delta)),
            link=IdentityLink(),
        )
        self.approx_hessian = True
        self.constant_hessian = False

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        # Start from the (weighted) median and shift it by the average of the
        # residuals clipped at delta.
        if sample_weight is None:
            median = np.percentile(y_true, 50, axis=0)
        else:
            median = _weighted_percentile(y_true, sample_weight, 50)
        diff = y_true - median
        term = np.sign(diff) * np.minimum(self.closs.delta, np.abs(diff))
        return median + np.average(term, weights=sample_weight)


class HalfPoissonLoss(BaseLoss):
    """Half Poisson deviance loss with log-link, for regression.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half the Poisson deviance is defined as::

        loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
                    - y_true_i + exp(raw_prediction_i)

    Half the Poisson deviance is actually the negative log-likelihood up to
    constant terms (not involving raw_prediction) and simplifies the
    computation of the gradients.
    We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfPoissonLoss(), link=LogLink())
        self.interval_y_true = Interval(0, np.inf, True, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        term = xlogy(y_true, y_true) - y_true
        if sample_weight is not None:
            term *= sample_weight
        return term


class HalfGammaLoss(BaseLoss):
    """Half Gamma deviance loss with log-link, for regression.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Gamma deviance loss is defined as::

        loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
                    + y_true/exp(raw_prediction_i) - 1

    Half the Gamma deviance is actually proportional to the negative log-
    likelihood up to constant terms (not involving raw_prediction) and
    simplifies the computation of the gradients.
    We also skip the constant term `-log(y_true_i) - 1`.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfGammaLoss(), link=LogLink())
        self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        term = -np.log(y_true) - 1
        if sample_weight is not None:
            term *= sample_weight
        return term


class HalfTweedieLoss(BaseLoss):
    """Half Tweedie deviance loss with log-link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers
    power in real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
                    + exp(raw_prediction_i)**(2-p) / (2-p)

    Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
    HalfPoissonLoss and HalfGammaLoss.

    We also skip constant terms, but those are different for p=0, 1, 2.
    Therefore, the loss is not continuous in `power`.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLoss(power=float(power)),
            link=LogLink(),
        )
        if self.closs.power <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif self.closs.power < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # The dropped constant terms differ for the special cases power = 0, 1, 2.
        if self.closs.power == 0:
            return HalfSquaredError().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        elif self.closs.power == 1:
            return HalfPoissonLoss().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        elif self.closs.power == 2:
            return HalfGammaLoss().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        else:
            p = self.closs.power
            term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p)
            if sample_weight is not None:
                term *= sample_weight
            return term


class HalfTweedieLossIdentity(BaseLoss):
    """Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    Note that the minimum value of this loss is 0.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLossIdentity(power=float(power)),
            link=IdentityLink(),
        )
        if self.closs.power <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif self.closs.power < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

        if self.closs.power == 0:
            self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
        else:
            self.interval_y_pred = Interval(0, np.inf, False, False)


class HalfBinomialLoss(BaseLoss):
    """Half Binomial deviance loss with logit link, for binary classification.

    This is also known as binary cross-entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half Binomial deviance is defined as the negative
    log-likelihood of the Binomial/Bernoulli distribution and can be expressed
    as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    Note that the formulation works for classification, y = {0, 1}, as well as
    logistic regression, y = [0, 1].
    If you add `constant_to_optimal_zero` to the loss, you get half the
    Bernoulli/binomial deviance.

    More details: Inserting the predicted probability y_pred = expit(raw_prediction)
    in the loss gives the well known::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyHalfBinomialLoss(),
            link=LogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # This is non-zero only if y_true is neither 0 nor 1.
        term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
        if sample_weight is not None:
            term *= sample_weight
        return term

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = self.link.inverse(raw_prediction)
        proba[:, 0] = 1 - proba[:, 1]
        return proba


class HalfMultinomialLoss(BaseLoss):
    """Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: We assume y_true to be already label encoded. The inverse link is
    softmax. But the full link function is the symmetric multinomial logit
    function.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the multinomial distribution; it
    generalizes the binary cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    Note that for the hessian, we calculate only the diagonal part in the
    classes: If the full hessian for classes k and l and sample i is H_i_k_l,
    we calculate H_i_k_k, i.e. k=l.

    Reference
    ---------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    """

    is_multiclass = True

    def __init__(self, sample_weight=None, n_classes=3):
        super().__init__(
            closs=CyHalfMultinomialLoss(),
            link=MultinomialLogit(),
            n_classes=n_classes,
        )
        self.interval_y_true = Interval(0, np.inf, True, False)
        self.interval_y_pred = Interval(0, 1, False, False)

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_true.includes(y) and np.all(y.astype(int) == y)

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the softmax of the weighted average of the target, i.e. over
        the samples axis=0.
        """
        out = np.zeros(self.n_classes, dtype=y_true.dtype)
        eps = np.finfo(y_true.dtype).eps
        for k in range(self.n_classes):
            out[k] = np.average(y_true == k, weights=sample_weight, axis=0)
            out[k] = np.clip(out[k], eps, 1 - eps)
        return self.link.link(out[None, :]).reshape(-1)

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        return self.link.inverse(raw_prediction)

    def gradient_proba(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        proba_out=None,
        n_threads=1,
    ):
        """Compute gradient and class probabilities for raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        if gradient_out is None:
            if proba_out is None:
                gradient_out = np.empty_like(raw_prediction)
                proba_out = np.empty_like(raw_prediction)
            else:
                gradient_out = np.empty_like(proba_out)
        elif proba_out is None:
            proba_out = np.empty_like(gradient_out)

        return self.closs.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            proba_out=proba_out,
            n_threads=n_threads,
        )
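
# Hedged sketch for the multiclass case (y_true must already be label encoded as
# 0 .. n_classes - 1; assumes the compiled Cython extension is available)::
#
#     import numpy as np
#     loss = HalfMultinomialLoss(n_classes=3)
#     y_true = np.array([0.0, 2.0, 1.0])
#     raw_prediction = np.zeros((3, 3))      # all classes equally likely
#     loss(y_true, raw_prediction)           # average loss equals log(3) here
#     loss.predict_proba(raw_prediction)     # every row is [1/3, 1/3, 1/3]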


class ExponentialLoss(BaseLoss):
    """Exponential loss with (half) logit link, for binary classification.

    This is also known as the boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    Note that the formulation works for classification, y = {0, 1}, as well as
    "exponential logistic" regression, y = [0, 1].
    Note that this is a proper scoring rule, but without its canonical link.

    More details: Inserting the predicted probability
    y_pred = expit(2 * raw_prediction) in the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
            + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyExponentialLoss(),
            link=HalfLogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # This is non-zero only if y_true is neither 0 nor 1.
        term = -2 * np.sqrt(y_true * (1 - y_true))
        if sample_weight is not None:
            term *= sample_weight
        return term

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Be graceful to shape (n_samples, 1) -> (n_samples,).
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = self.link.inverse(raw_prediction)
        proba[:, 0] = 1 - proba[:, 1]
        return proba
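
# Sketch of the exponential-loss formula noted in the docstring above, in plain
# NumPy and independent of the Cython implementation (``exponential_loss`` is an
# illustrative helper, not part of this module)::
#
#     import numpy as np
#     def exponential_loss(y_true, raw_prediction):
#         return y_true * np.exp(-raw_prediction) + (1 - y_true) * np.exp(raw_prediction)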
Zsquared_errorZabsolute_errorZpinball_lossZ
huber_lossZpoisson_lossZ
gamma_lossZtweedie_lossZbinomial_lossZmultinomial_lossZexponential_loss)*rX   rn   numpyr   Zscipy.specialr   utilsr   Zutils.statsr   Z_lossr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r[   rc   ri   rs   r{   r~   r   r   r   r   r   Z_LOSSESr$   r$   r$   r%   <module>   sD   4    @";I @.E J
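
# End-to-end sketch of how the registry and the BaseLoss API fit together
# (hedged; assumes the compiled Cython extension is available)::
#
#     import numpy as np
#     loss = _LOSSES["poisson_loss"](sample_weight=None)
#     y_true = np.array([0.0, 1.0, 3.0])
#     raw_prediction = loss.fit_intercept_only(y_true) * np.ones_like(y_true)
#     gradient, hessian = loss.init_gradient_and_hessian(n_samples=3)
#     loss.gradient_hessian(
#         y_true, raw_prediction, gradient_out=gradient, hessian_out=hessian
#     )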