U
    9%eW                  
   @   s  d dl Zd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZmZ d dlmZ d dlmZmZmZ dd	 Zejd
ejd ddgejdgdfejddejgejdgdfejdddgedgdfdgejdddgejdddgdd Zejdejd gd  dgd   d!g gejdjd ddggfejdgd  dgd   dg gedjddd"ggfgejdd#dgd$d% Z ejd&ddd dgd'feddd gddd!ggjd(fdddgd)fgd*d+ Z!d,d- Z"d.d/ Z#ejd0d1d2gejdddgejdd3d4dgd5d6 Z$ejjd7ed8gd9 d8fed gd9 d fejd:gd9 edd fgddd;gd<ejddd#d=gd>d? Z%d@dA Z&dBdC Z'ejdd=dDdgdEdF Z(ej)dGejdd=dgdHdI Z*dS )J    N)assert_allcloseassert_array_equal)RandomForestRegressor)Ridge)KFoldShuffleSplitStratifiedKFoldcross_val_scoretrain_test_split)make_pipeline)KBinsDiscretizerLabelEncoderTargetEncoderc                 C   s   t j|t jd}t |}|dkrt |}t|D ]h}|| |k }|jd }	|	dkr`|||< q4t |}
|
| }|	|	|  }|t | d| |  ||< q4|S t|D ]<}|| |k }t |||  }|jd | }|| ||< q|S dS )z0Simple Python implementation of target encoding.Zdtypeautor      N)npZzerosfloat64meanvarrangeshapesum)	X_ordinaly_intn_categoriessmoothcur_encodingsy_meanZ
y_variancecZy_subsetZn_iZy_subset_variancemZlambda_Zcurrent_sumZcurrent_cnt r!   n/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/sklearn/preprocessing/tests/test_target_encoder.py_encode_target   s*    



 r#   zcategories, unknown_valuer      r            ?      @g      @catdogsnakebear)r      r   g      @r   target_typebinary
continuousc                 C   sj  d}t jdgd dgd  dgd  gt jdj}t jdddggt jdj}|jd }| d	krh|}	|}
n| d | }	| d | }
t |
|ggf}
t j|}d}|d
kr|jdd|d}t jddgt	d}|| }n|j
dd|d}|}||}|| }|	| }	|| }|| }|d
kr.t||dd}nt||dd}t j|t jd}|||D ]F\}}||df ||  }}t||||}|||df  ||df< qXt|| ||d}||	|}|j|kstt|| t|jdkstt |}t|dddf |||}t|jd | |jt|ks4tt |t |gfdd}||
}t|| dS )zCheck encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    r,   r      r      r$   (   r   r   r.   lowhighsizer(   r)   T)n_splitsrandom_stateshuffle)r   
categoriescvr9   N)r   arrayint64Tr   concatenaterandomRandomStaterandintobjectuniformpermutationr   r   
empty_liker   splitr#   r   fit_transformtarget_type_AssertionErrorr   len
encodings_r   target_mean_pytestapproxreshape	transform)r;   Zunknown_valueglobal_random_seedr   r-   r   ZX_train_int_arrayZX_test_int_array	n_samplesX_trainX_testZdata_rngr8   r   Ztarget_namesy_trainZshuffled_idxr<   expected_X_fit_transform	train_idxtest_idxX_y_r   target_encoderX_fit_transformr   expected_encodingsexpected_X_test_transformX_test_transformr!   r!   r"   test_encoding4   s    .



  


    
rc   zX, categories
   r,   cow      @c                 C   s   t jd}|jdd| jd d}t||dd| |}| }|| dd }|d t	
|kshtt|jd	kszt|jd d t	
|kstdS )
zHCustom categories with unknown categories that are not in training data.r   r7   r0   r3   )r;   r   r9   r=   N)r   r   r   )r   rB   rC   rF   r   r   fitr   rS   rP   rQ   rL   rM   rN   )Xr;   r   rngyencr   X_transr!   r!   r"   test_custom_categories   s    rm   zy, msgz'Found input variables with inconsistentz7Target type was inferred to be 'multiclass-multioutput'+Target type was inferred to be 'multiclass'c              	   C   sD   t dddggj}t }tjt|d |||  W 5 Q R X dS )zCheck invalidate input.r   r   matchN)r   r>   r@   r   rP   raises
ValueErrorrJ   )rj   msgrh   rk   r!   r!   r"   test_errors   s    rt   c               	   C   s   t ddddddggj} t ddddddg}t }d}tjt|d || | W 5 Q R X td	d
}|| | |jd	kst	dS )z6Custom target_type to avoid inferring the target type.r   r   r&   g       @r'   rf   rn   ro   r/   )r-   N)
r   r>   r@   r   rP   rq   rr   rJ   rK   rL   )rh   rj   rk   rs   r!   r!   r"   test_use_regression_target   s    
ru   c                  C   s   t d} | ddgd ddgd d}ddgd }tddd	d
}|jdd tddd	d
}|jdd |||}|||}t| | t|	 ddg t|	 |j
 dS )z*Check TargetEncoder works with set_output.pandasabrd   r   r$   )ABr'   r   r<   r   r9   default)rS   ry   rz   N)rP   importorskip	DataFramer   Z
set_outputrJ   r   Zto_numpyr   Zget_feature_names_outcolumns)pdZX_dfrj   Zenc_defaultZ
enc_pandasZ	X_defaultZX_pandasr!   r!   r"   !test_feature_names_out_set_output   s    
 r   	to_pandasTFbinary-ints
binary-strc              
   C   s  t jddgddgddgddgddgddgddgddggt jd}|dkr~t ddddddddg}t |}tdddd	}np|d
krt ddddddddg}t |}tdddd	}n2t jddddddddgt jd}|}tdddd	}t |}dddgddgg}t jddgddgddggt jd}	| rt	
d}
|
|dddf t jddgtd|dddf  d}|
|	dddf dddgd}	n|}t j|t jd}t|D ]b\}}|||D ]J\}}|||f ||  }}t||t||}||||f  |||f< qʐqg }t|D ]2\}}t|dd|f |t||}|| q&t j|d d |d d g||d d g|d d |ggt jd}t|ddd}|||}t|| t|jdksttdD ]}t|j| ||  q||	}t|| dS )z,Check target encoder with multiple features.r   r   r$   r   r   rw   rx   T)r9   r:   r   r,   r%   r'   gffffff@g333333@g      @gffffff@g      @皙$@g333333@rd   rv   Nr(   r)   )Zfeat0Zfeat1r*   )r   r<   r9   )r   r>   r?   r   rJ   r   Zfloat32r   r   rP   r}   r~   rE   rH   r   	enumeraterI   r#   rM   appendr   r   rN   rL   r   rS   )r   r   r-   r   rX   Z	y_integerr<   r   r;   rW   r   rV   rY   Zf_idxZcatsrZ   r[   r\   r]   Zcurrent_encodingr`   ra   rk   r_   irb   r!   r!   r"   test_multiple_features_quick   s    2  
	
 $
   	

r   z	y, y_meang333333@r0   rw   zbinary-string)Zids        c           	      C   s   t dgd gj}|jd }td|dd}||| }t|t j|gg|dd |jd d t	
|ksnt|jt	
|kstt dgdgg}||}t|t j|ggddd dS )z5Check edge case where feature and target is constant.r   r0   r   r$   r{   ZaxisN)r   r>   r@   r   r   rJ   r   repeatrN   rP   rQ   rL   rO   rS   )	rj   r   r   rh   rU   rk   rl   rW   ZX_test_transr!   r!   r"    test_constant_target_and_featureI  s    

r   c                 C   s   d}d}t j| }|j|d}|jd||ddd}| }|| }|| }td| d}|||}td	d
}|||}	t	dd| d}
t
d| d}t|
|||d dk stt|
|||d dk stt|
|	||d dkstd S )Nr1   i  r6   r   r=   r   T)r:   r9   F)r:   rd   r0   )Zn_estimatorsZmin_samples_leafr9   2   )r8   r9   )r<   皙?      ?)r   rB   rC   normalrD   rR   Zargsortr   rJ   r   r   r	   r   rL   )rT   ZcardinalityrU   ri   rX   rV   Zy_sorted_indicesr^   ZX_encoded_train_shuffledZX_encoded_train_no_shuffledZ	regressorr<   r!   r!   r"   Ftest_fit_transform_not_associated_with_y_if_ordinal_categorical_is_notc  s6    
  		r   c                  C   s   t ddddddddddg
gj} t ddddddd	d
ddg
}tdddd}|| |}t|d t |dd  t|d t |dd  dS )zECheck edge case with zero smoothing and cv does not contain category.r   r   g @g333333@g333333?g@r&   g      "@r   gffffff,@g*@g      .@r   Fr$   )r   r:   r<      Nr=   )r   r>   r@   r   rJ   r   r   )rh   rj   rk   rl   r!   r!   r"   test_smooth_zero  s    "r   g     @@c                 C   s   t j|}|jdd}d}t|dd|dd}t|||d\}}}}	||}
|
|	t j
 }|
|	t j
 }t| |d	}|||}||}|||}||}t|| t|| d S )
Ni  r   r1   ordinal)n_binsencoder=   r   r9   r   r9   )r   rB   rC   r   r   rJ   rR   r
   rG   astypeint32r   rS   r   )r   rT   ri   rj   r   rh   rV   rW   rX   y_testpermutated_labelsZX_train_permutedZX_test_permutedr^   ZX_train_encodedZX_test_encodedZX_train_permuted_encodedZX_test_permuted_encodedr!   r!   r"   3test_invariance_of_encoding_under_label_permutation  s*    
  



r   z0ignore:In version 1.5 onwards, subsample=200_000c                 C   s"  t dddd}d}tj|}||}d|| }d}t|dd	|d
|| dd}||}	|	|	tj
 }||}
|jtd| |dddd}tj||
|gdd}t||dd\}}}}|||}|||dk st|||dk sttt| |d|||}|d j}|||dks<t||||dksVt||d tjdddksrtt|dd  dk  stt| |d||}||}||}|||}|j}|||dkst||||dk s t|t|d t|d k std S )Ngư>ZlsqrF)alphaZsolverZfit_interceptiP  g?d   r   rF   )r   r   Zstrategyr9   r=   r   g?T)r6   replacer   r   r   r   r   r   g{Gz?)absg?gffffff?r$   )r   r   rB   rC   Zrandnr   rJ   rR   rG   r   r   choiceintrA   r
   rg   ZscorerL   r   r   Zcoef_rP   rQ   r   allrS   )r   rT   Zlinear_regressionrU   ri   rj   noiser   ZX_informativer   Z
X_shuffledZX_near_unique_categoriesrh   rV   rW   rX   r   Z	raw_modelZmodel_with_cvZcoefr^   ZX_enc_no_cv_trainZX_enc_no_cv_testZmodel_no_cvr!   r!   r"   *test_target_encoding_for_linear_regression  sp    




   
  
  

r   )+numpyr   rP   Znumpy.testingr   r   Zsklearn.ensembler   Zsklearn.linear_modelr   Zsklearn.model_selectionr   r   r   r	   r
   Zsklearn.pipeliner   Zsklearn.preprocessingr   r   r   r#   markZparametrizer>   r?   nanr   rE   rc   r@   rm   rt   ru   r   r   r   r   r   r   filterwarningsr   r!   r!   r!   r"   <module>   s   	[(
 

	R	.
$
