U
    ¿9%eÆ=  ã                   @   sö   d Z ddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ d	d
lmZmZ d	dlmZ d	dlmZ d	dlmZ d	dlmZmZ ddlmZ e ej ¡j!Z"dd„ Z#ddd„Z$dd„ Z%dd„ Z&G dd„ deeƒZ'dS )z<
A Theil-Sen Estimator for Multiple Linear Regression Model
é    N)Úcombinations)ÚIntegralÚReal)Úeffective_n_jobs)Úlinalg)Úget_lapack_funcs)Úbinomé   )ÚRegressorMixinÚ_fit_context)ÚConvergenceWarning)Úcheck_random_state)ÚInterval)ÚParallelÚdelayedé   )ÚLinearModelc                 C   sÞ   | | }t  t j|d dd¡}|tk}t| ¡ | jd k ƒ}|| }|| dd…t jf }t t j|| dd¡}|tkr®t j| |dd…f | ddt jd| dd }nd}d}t	dd||  ƒ| t
d|| ƒ|  S )u	  Modified Weiszfeld step.

    This function defines one iteration step in order to approximate the
    spatial median (L1 median). It is a form of an iteratively re-weighted
    least squares method.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    x_old : ndarray of shape = (n_features,)
        Current start vector.

    Returns
    -------
    x_new : ndarray of shape (n_features,)
        New iteration step.

    References
    ----------
    - On Computation of Spatial Median for Robust Data Mining, 2005
      T. KÃ¤rkkÃ¤inen and S. Ã„yrÃ¤mÃ¶
      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
    r	   r   ©Zaxisr   Ng      ð?ç        )ÚnpÚsqrtÚsumÚ_EPSILONÚintÚshapeZnewaxisr   ZnormÚmaxÚmin)ÚXZx_oldÚdiffZ	diff_normÚmaskZis_x_old_in_XZquotient_normZnew_direction© r    ú^/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/sklearn/linear_model/_theil_sen.pyÚ_modified_weiszfeld_step   s$      ÿ
ÿÿr"   é,  çü©ñÒMbP?c                 C   sŽ   | j d dkr$dtj|  ¡ ddfS |dC }tj| dd}t|ƒD ].}t| |ƒ}t || d ¡|k rl q†qB|}qBt 	dj
|dt¡ ||fS )	u	  Spatial median (L1 median).

    The spatial median is member of a class of so-called M-estimators which
    are defined by an optimization problem. Given a number of p points in an
    n-dimensional space, the point x minimizing the sum of all distances to the
    p other points is called spatial median.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    max_iter : int, default=300
        Maximum number of iterations.

    tol : float, default=1.e-3
        Stop the algorithm if spatial_median has converged.

    Returns
    -------
    spatial_median : ndarray of shape = (n_features,)
        Spatial median.

    n_iter : int
        Number of iterations needed.

    References
    ----------
    - On Computation of Spatial Median for Robust Data Mining, 2005
      T. KÃ¤rkkÃ¤inen and S. Ã„yrÃ¤mÃ¶
      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
    r   T)Zkeepdimsr	   r   r   zYMaximum number of iterations {max_iter} reached in spatial median for TheilSen regressor.)Úmax_iter)r   r   ZmedianZravelZmeanÚranger"   r   ÚwarningsÚwarnÚformatr   )r   r%   ÚtolZspatial_median_oldZn_iterZspatial_medianr    r    r!   Ú_spatial_medianQ   s     "
þür+   c                 C   s(   ddd|  | | d  | d |   S )a  Approximation of the breakdown point.

    Parameters
    ----------
    n_samples : int
        Number of samples.

    n_subsamples : int
        Number of subsamples to consider.

    Returns
    -------
    breakdown_point : float
        Approximation of breakdown point.
    r   g      à?r    )Ú	n_samplesÚn_subsamplesr    r    r!   Ú_breakdown_point‰   s    ÿþûÿÿr.   c                 C   sÂ   t |ƒ}| jd | }|jd }t |jd |f¡}t ||f¡}t t||ƒ¡}td||fƒ\}	t|ƒD ]R\}
}| |dd…f |dd…|d…f< || |d|…< |	||ƒd d|… ||
< qj|S )a  Least Squares Estimator for TheilSenRegressor class.

    This function calculates the least squares method on a subset of rows of X
    and y defined by the indices array. Optionally, an intercept column is
    added if intercept is set to true.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : ndarray of shape (n_samples,)
        Target vector, where `n_samples` is the number of samples.

    indices : ndarray of shape (n_subpopulation, n_subsamples)
        Indices of all subsamples with respect to the chosen subpopulation.

    fit_intercept : bool
        Fit intercept or not.

    Returns
    -------
    weights : ndarray of shape (n_subpopulation, n_features + intercept)
        Solution matrix of n_subpopulation solved least square problems.
    r   r   )ZgelssN)	r   r   r   ÚemptyZonesZzerosr   r   Ú	enumerate)r   ÚyÚindicesÚfit_interceptÚ
n_featuresr-   ÚweightsZX_subpopulationZy_subpopulationZlstsqÚindexZsubsetr    r    r!   Ú_lstsq¤   s    
 r7   c                   @   s¤   e Zd ZU dZdgdgeeddddgdegeeddddgeeddddgd	gdegd
gdœ	Zee	d< ddddddddddœ	dd„Z
dd„ Zedddd„ ƒZdS )ÚTheilSenRegressoraR  Theil-Sen Estimator: robust multivariate regression model.

    The algorithm calculates least square solutions on subsets with size
    n_subsamples of the samples in X. Any value of n_subsamples between the
    number of features and samples leads to an estimator with a compromise
    between robustness and efficiency. Since the number of least square
    solutions is "n_samples choose n_subsamples", it can be extremely large
    and can therefore be limited with max_subpopulation. If this limit is
    reached, the subsets are chosen randomly. In a final step, the spatial
    median (or L1 median) is calculated of all least square solutions.

    Read more in the :ref:`User Guide <theil_sen_regression>`.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    max_subpopulation : int, default=1e4
        Instead of computing with a set of cardinality 'n choose k', where n is
        the number of samples and k is the number of subsamples (at least
        number of features), consider only a stochastic subpopulation of a
        given maximal size if 'n choose k' is larger than max_subpopulation.
        For other than small problem sizes this parameter will determine
        memory usage and runtime if n_subsamples is not changed. Note that the
        data type should be int but floats such as 1e4 can be accepted too.

    n_subsamples : int, default=None
        Number of samples to calculate the parameters. This is at least the
        number of features (plus 1 if fit_intercept=True) and the number of
        samples as a maximum. A lower number leads to a higher breakdown
        point and a low efficiency while a high number leads to a low
        breakdown point and a high efficiency. If None, take the
        minimum number of subsamples leading to maximal robustness.
        If n_subsamples is set to n_samples, Theil-Sen is identical to least
        squares.

    max_iter : int, default=300
        Maximum number of iterations for the calculation of spatial median.

    tol : float, default=1e-3
        Tolerance when calculating spatial median.

    random_state : int, RandomState instance or None, default=None
        A random number generator instance to define the state of the random
        permutations generator. Pass an int for reproducible output across
        multiple function calls.
        See :term:`Glossary <random_state>`.

    n_jobs : int, default=None
        Number of CPUs to use during the cross validation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,)
        Coefficients of the regression model (median of distribution).

    intercept_ : float
        Estimated intercept of regression model.

    breakdown_ : float
        Approximated breakdown point.

    n_iter_ : int
        Number of iterations needed for the spatial median.

    n_subpopulation_ : int
        Number of combinations taken into account from 'n choose k', where n is
        the number of samples and k is the number of subsamples.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    HuberRegressor : Linear regression model that is robust to outliers.
    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009
      Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang
      http://home.olemiss.edu/~xdang/papers/MTSE.pdf

    Examples
    --------
    >>> from sklearn.linear_model import TheilSenRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
    >>> reg = TheilSenRegressor(random_state=0).fit(X, y)
    >>> reg.score(X, y)
    0.9884...
    >>> reg.predict(X[:1,])
    array([-31.5871...])
    Úbooleanr   NÚleft)Úclosedr   r   Úrandom_stateÚverbose©	r3   Úcopy_XÚmax_subpopulationr-   r%   r*   r<   Ún_jobsr=   Ú_parameter_constraintsTg     ˆÃ@r#   r$   Fc       	   
      C   s:   || _ || _|| _|| _|| _|| _|| _|| _|	| _d S ©Nr>   )
Úselfr3   r?   r@   r-   r%   r*   r<   rA   r=   r    r    r!   Ú__init__R  s    zTheilSenRegressor.__init__c                 C   s¾   | j }| jr|d }n|}|d k	r†||kr:td ||¡ƒ‚||krl||kr„| jrTdnd}td |||¡ƒ‚q||krtd ||¡ƒ‚n
t||ƒ}tdt t||ƒ¡ƒ}t	t| j
|ƒƒ}||fS )Nr   z=Invalid parameter since n_subsamples > n_samples ({0} > {1}).z+1Ú zAInvalid parameter since n_features{0} > n_subsamples ({1} > {2}).z\Invalid parameter since n_subsamples != n_samples ({0} != {1}) while n_samples < n_features.)r-   r3   Ú
ValueErrorr)   r   r   r   Úrintr   r   r@   )rD   r,   r4   r-   Zn_dimZplus_1Zall_combinationsZn_subpopulationr    r    r!   Ú_check_subparamsi  sB    
 ÿÿ  þÿ þÿ
z"TheilSenRegressor._check_subparams)Zprefer_skip_nested_validationc           	         sp  t ˆjƒ‰ˆjˆ ˆdd\‰ ‰ˆ j\‰}ˆ ˆ|¡\‰ˆ_tˆˆƒˆ_ˆjr–t	d 
ˆj¡ƒ t	d 
ˆ¡ƒ tˆjˆ ƒ}t	d 
|¡ƒ t	d 
ˆj¡ƒ t tˆˆƒ¡ˆjkrÀtttˆƒˆƒƒ}n‡‡‡fdd„tˆjƒD ƒ}tˆjƒ}t ||¡‰t|ˆjd	‡ ‡‡‡fd
d„t|ƒD ƒƒ}t |¡}t|ˆjˆjd\ˆ_}ˆjr`|d ˆ_|dd… ˆ_ndˆ_|ˆ_ˆS )aU  Fit linear model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted `TheilSenRegressor` estimator.
        T)Z	y_numericzBreakdown point: {0}zNumber of samples: {0}zTolerable outliers: {0}zNumber of subpopulations: {0}c                    s   g | ]}ˆj ˆ ˆd d‘qS )F)ÚsizeÚreplace)Úchoice)Ú.0Ú_)r,   r-   r<   r    r!   Ú
<listcomp>±  s   ÿz)TheilSenRegressor.fit.<locals>.<listcomp>)rA   r=   c                 3   s&   | ]}t tƒˆ ˆˆ| ˆjƒV  qd S rC   )r   r7   r3   )rM   Zjob)r   Ú
index_listrD   r1   r    r!   Ú	<genexpr>¸  s   ÿz(TheilSenRegressor.fit.<locals>.<genexpr>)r%   r*   r   r   Nr   )r   r<   Z_validate_datar   rI   Zn_subpopulation_r.   Z
breakdown_r=   Úprintr)   r   r   rH   r   r@   Úlistr   r&   r   rA   Zarray_splitr   Zvstackr+   r%   r*   Zn_iter_r3   Z
intercept_Zcoef_)	rD   r   r1   r4   Ztol_outliersr2   rA   r5   Zcoefsr    )r   rP   r,   r-   r<   rD   r1   r!   ÚfitŽ  sH    

 ÿ
þ
þ
  ÿ
zTheilSenRegressor.fit)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   rB   ÚdictÚ__annotations__rE   rI   r   rT   r    r    r    r!   r8   Ð   s0   
uöõ%r8   )r#   r$   )(rX   r'   Ú	itertoolsr   Únumbersr   r   Únumpyr   Zjoblibr   Zscipyr   Zscipy.linalg.lapackr   Zscipy.specialr   Úbaser
   r   Ú
exceptionsr   Úutilsr   Zutils._param_validationr   Zutils.parallelr   r   Ú_baser   ZfinfoÚdoubleZepsr   r"   r+   r.   r7   r8   r    r    r    r!   Ú<module>   s(   	3
8,