U
    -e0                    @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZmZ dd Zejd	d
dgdd Zdd Zejd	d
dgdd Zejdejejejgejdejejejgdd Zejdejejejgdd Zdd Zdd Zdd Zdd Zd d! Z ejjd"d#d$d%gd&d'd%gge!d(d$d%gd)d'd%ggej!d*d+d,gd-d.d,gge"d/ej!d*d$d,gd-ej#d,gge"d/ej!d*d$d,gd-e$d0d,gge"d/ej!dd$d,gd-d'd,gge"d/ej!dd$dgd-ej#dgge"d/ej!dd$dgd-e$d0dgge"d/gd1d2d3d4d5d6d7d8gd9d:d; Z%ejd	d
dgejd<d=d>gejd?dd@gdAdB Z&ejd<d=d>gejdCd'd%gd$d%gd'd%ggd d$d$gd d d gd d$d$ggfdDd-gdEd-gdFd*gdEd-ggd d d d d gd d d d d$gd d$d d d ggfgdGdH Z'dIdJ Z(ejd?dKd@dgejdLdKd@dgdMdN Z)ejdOdPdQgejd"d$d'ge!dRdSggdTdU Z*ejdOdPdQgdVdW Z+ejjdXd&d%gd#d%ggd&d#gd%ggej,fe!d$d'gdYd'ggd$dYgd'ggej-fej!d+d,gd.d,gge"d/d+d.gd,ggej,fe!d+d,gd.d,ggd+d.gd,ggej.fe!d$d'gej#d'ggd$ej#gd'ggejfej!d+ej#gdej#gge"d/d+dgej#ggej,fej!d+e$d0gde$d0gge"d/d+dge$d0ggej,fgd1d2d3dZd[d\d]gd9d^d_ Z/ejd	d
dgejjd`ej!d-d*gge"d/j0ej!d-dagge"d/j0d-d*dbggej,fej!d$d'ggdcd/j0ej!d$ddggdcd/j0d$d'dYggej1fej!d-d*gge"d/j0ej!d-dagge"d/j0e!d-d*dbggej,fej!dd-gge"d/j0ej!dd*gge"d/j0dd-degge"fej!d-d*gge"d/j0ej!d-ej#gge"d/j0d-d*degge"fej!d-dgge"d/j0ej!d-ej#gge"d/j0d-ddegge"fej!d-ej#gge"d/j0ej!d-dgge"d/j0d-ej#degge"fgd3d2dfdgdhdidjgd9dkdl Z2dmdn Z3dodp Z4dqdr Z5ejjdsd@dtdugfdKdtdvdugfdbd'd*gdwdxgfgd@dydzgd9d{d| Z6d}d~ Z7ejjd"d&d'd%gd#d$d%gge!d(d'd%gdd$d%ggej!d-d.d,gd*d+d,gge"d/gd1d2d3gd9dd Z8ejjd`ej!d-d*gge"d/j0ej!d-dagge"d/j0d-d*dbggej,fej!d$d'ggdcd/j0ej!d$ddggdcd/j0d$d'dYggej1fej!d-d*gge"d/j0ej!d-dagge"d/j0e!d-d*dbggej,fgd3d2dgd9dd Z9dd Z:dd Z;ejde$e<gdd Z=dd Z>dd Z?dd Z@dd ZAdd ZBdd ZCejdej#de$d0gdd ZDejd?d&dYgd&dYdd-ggdd ZEejjdd>d=gddgd9ejjd?d@d-d'd*ggd@dzgd9dd ZFejde	e
gdd ZGejddd'iddiddid'ddddddgejddd-d*dbdagggdd ZHejd?dKd@d*ggdd ZIejd?d-gdaggdd ZJejdddYiddiddiddiddidYddddddgdd ZKejd?d@d*ggdd ZLejd?d-gdaggdd ZMdd ZNejddYd$ddddigddÄ ZOddń ZPddǄ ZQddɄ ZRdd˄ ZSdd̈́ ZTejddd$dϜgddф ZUejdd'dYdϜgddӄ ZVddՄ ZWejddddddddgejddddgdd ZXdd ZYejdej#dgdd ZZdd Z[ejd	dd
gejdddgdd Z\ejd	d
dgdd Z]ejd	d
dgdd Z^ejd	d
dgdd Z_dd Z`dd Zaejdej#dgdd Zbejdddgejdej#dgdd Zcejjd`ej!d-ej#gge"d/j0ej!d-d*gge"d/j0ej!d-ej#dage"d/gej,fej!d-ej#gge"d/j0ej!d-d*gge"d/j0ej!d-ej#dage"d/gej,fej!dej#ggejd/j0ej!dRggejd/j0e!ddSej#ggejfgd ddgd9dd Zdejde!dej#dRggj0e!dej#dggj0e!dSggfe!ddSdRggj0e!dddggj0e!ej#ggfej!dbej#d*gge"d/j0e!dej#dggj0ej!dagge"d/fej!dbd-d*gge"d/j0e!dddggj0ej!ej#gge"d/fgdd	 Zed
d Zfdd Zgejddd.ggej!dd.ggdd/ej!dd.ggdd/gejdd+d.ggej!d+d.ggdd/ej!d+d.ggdd/gdd Zhdd Zidd Zjdd Zkejdd>d=gdd Zlejdej!d-gdgge"d/d gej#gej#ggejmdgdgdgge"d/fej!ej#gdgd-gge"d/d gej#gej#ggejmdgej#gej#gge"d/fgd d! Znd"d# Zod$d% Zpd&d' Zqd(d) Zrd*d+ ZsejdddYiddiddiddiddidYddddddgd,d- Ztd.d/ Zud0d1 Zvd2d3 Zwd4d5 Zxejdddidd'igd6d7 Zyejddd$idd8igd9d: Zzd;d< Z{d=d> Z|dS (?      Nsparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equalc                  C   s   t dddgdddgg} t }tdd}|| }|| }|jdksLt|jdksZtt|shtt|rvtt|	 dd	dd	d	gd	dd	dd	gg t|	 | d S )
N         r   Fsparse_outputr                    ?)
nparrayr   fit_transformshapeAssertionErrorr   issparser
   toarray)XZ
enc_sparseZ	enc_denseX_trans_sparseZX_trans_dense r   j/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_dense   s    


 r   handle_unknownignoreinfrequent_if_existc                 C   s   t dddgdddgdddgg}t dddgg}tdd}|| tjtdd	 || W 5 Q R X t| d}|| | }t	||
 t d
d
d
d
dd
d
gg t|| d S )Nr   r   r   r      errorr    Found unknown categoriesmatchr   r   )r   r   r   fitpytestraises
ValueError	transformcopyr
   r   r	   r    r   X2ohZ	X2_passedr   r   r   #test_one_hot_encoder_handle_unknown(   s    "



r2   c               	   C   sL   t dgdgg} tddgd}d}tjt|d ||  W 5 Q R X d S )Nab
categorieszqThis OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.r'   )r   r   r   r*   r+   r   r-   )r   encmsgr   r   r   test_one_hot_encoder_not_fitted@   s    r9   c              	   C   s   t ddddgd}t ddgd}t| d}|| | }t|| t ddddgdd	ddgg t|| d S )
NZ11111111Z22Z333Z4444)r   Z55555r%   r   r   )	r   r   reshaper   r)   r.   r
   r-   r   r/   r   r   r   +test_one_hot_encoder_handle_unknown_stringsL   s    

r<   output_dtypeinput_dtypec                 C   s   t jddgg| dj}t jddgddgg|d}td|d}t|| | t||| | td|dd}t||| t|||| d S )Nr   r   dtypeauto)r6   r@   F)r6   r@   r   )	r   asarrayTr   r
   r   r   r)   r-   )r>   r=   r   
X_expectedr1   r   r   r   test_one_hot_encoder_dtype_   s    rE   c                 C   s   t d}|ddgddgd}tjddddgddddgg| d}t| d}t|| | t|	|
| | t| d	d
}t||| t|	|
|| d S )Npandasr3   r4   r   r   ABr   r?   F)r@   r   )r*   importorskip	DataFramer   r   r   r
   r   r   r)   r-   )r=   pdX_dfrD   r1   r   r   r   !test_one_hot_encoder_dtype_pandasn   s    
"
rN   c                  C   s   t  } dddddgdddddgdd	d
ddgdddddgg}| | |  }tdddddddddddddddg| | ddd d!d"g}td#d$d%d&d'd(d)d*d+d,d-d.d/d0d1g| tjtd2d3 | ddg W 5 Q R X d S )4NMaler   girlr   r   Female)   
   3   boy   [         Z	x0_FemaleZx0_MaleZx1_1Zx1_41Zx1_51Zx1_91Zx2_boyZx2_girlZx3_1Zx3_2Zx3_12Zx3_21Zx4_3Zx4_10Zx4_30onetwothreefourfiveZ
one_FemaleZone_MaleZtwo_1Ztwo_41Ztwo_51Ztwo_91Z	three_boyZ
three_girlZfour_1Zfour_2Zfour_12Zfour_21Zfive_3Zfive_10Zfive_30z!input_features should have lengthr'   )r   r)   get_feature_names_outr
   r*   r+   r,   )r7   r   feature_namesZfeature_names2r   r   r   "test_one_hot_encoder_feature_names~   sb    
ra   c                  C   s\   t  } tjddggtdj}| | |  }tddg| | jdgd}tdd	g| d S )
Nu   c❤t1Zdat2r?   u	   x0_c❤t1Zx0_dat2u   n👍meZinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrC   r)   r_   r
   )r7   r   r`   r   r   r   *test_one_hot_encoder_feature_names_unicode   s    
rd   c               	   C   s   dd } t | d}tjddggtdj}|| | }tddg| |jd	gd
}tddg| dd }t |d|}d}tj	t
|d |  W 5 Q R X dS )z=Check the behaviour of `feature_name_combiner` as a callable.c                 S   s   | d t | S )N_)reprfeaturecategoryr   r   r   name_combiner   s    zHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner)Zfeature_name_combinerNoneNr?   z	x0_'None'Zx0_Noner3   rb   za_'None'Za_Nonec                 S   s   dS )Nr   r   rg   r   r   r   wrong_combiner   s    zItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combinerzMWhen `feature_name_combiner` is a callable, it should return a Python string.r'   )r   r   r   rc   rC   r)   r_   r
   r*   r+   	TypeError)rj   r7   r   r`   rl   err_msgr   r   r   1test_one_hot_encoder_custom_feature_name_combiner   s    

ro   c                  C   s   t ddggj} t }|jddddggd | d ddddggksLt||  j	dksdt|jdddddggd ||  j	d	kstd S )
Nr   r   r   r   r5   r6   )r   r#   r#   r   )
r   r   rC   r   
set_params
get_paramsr   r   r   r   )r   r1   r   r   r   test_one_hot_encoder_set_params   s    rr   c                 C   sX   t dd}|| }t ddd}|| }t| | t|rL|jdksPt| S )NrA   r5   Fr6   r   Zcsr)r   r   r	   r   r   r   formatr   )r   r7   ZXtr1ZXtr2r   r   r   check_categorical_onehot   s    


ru   r   defr   7   abcr   rS   r   r4   rH   catr3   rI   r?   nanmixednumericrc   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)Zidsc                 C   s   t t| d d dgf }t|ddgddgg t t| d d ddgf }t|ddddgddddgg tdd| }t| dddddgdddddgg d S )Nr   r   rA   r5   )ru   r   r   r	   r   r   r   )r   Xtrr   r   r   test_one_hot_encoder   s    r~   sparse_FTdropfirstc              	   C   s  dddgdddgdddgg}t ||d}||}tj|td}t||| ddgddgddgg}t |d	|d
}||}t|}t||| |d krdddgdddgdddgg}t || ddgddgdddggd}||}tj|td}d |d< t||| ddgddgddgg}t |ddgddgg| d}||}tj|td}d |d< d |d d df< t||| tdddgdddgg}td}t	j
t|d || W 5 Q R X d S )Nrx   r   rw   rv   r   r   r   r   r?   rA   )r   r6   r   6   8   )r   r    r6   )r   r   )r   r6   r    r   r   r   )Shape of the passed X data is not correctr'   )r   r   r   r   rc   r
   inverse_transformreescaper*   r+   r,   )r    r   r   r   r7   X_trexpr8   r   r   r   test_one_hot_encoder_inverse  sH    






r   z
X, X_transrZ   r[   r\   c              	   C   sJ   t |d| }d}|r"t|d}tjt|d || W 5 Q R X dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r'   N)r   r)   r   r*   r+   r,   r   )r   X_transr   r7   r8   r   r   r   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownK  s    
r   c                  C   sJ   t jddgddgddggtd} tddd	}|| }t|||  d S )
NrO   r   rQ   r   r   r?   	if_binaryFr   r   )r   r   rc   r   r   r
   r   )r   oher   r   r   r   &test_one_hot_encoder_inverse_if_binaryk  s     
r   r   
reset_dropc                 C   s   t jddgddgddggtd}t| dd}|| ||}| }|j|d	 t|	|| t
||| t| | d S )
NrO   r   rQ   r   r   r?   Fr   r   )r   r   rc   r   r)   r-   r_   rp   r
   r   r	   )r   r   r   r   r   r`   r   r   r   test_one_hot_encoder_drop_resetr  s     

r   methodr)   r         @      @c              	   C   s6   t  }d}tjt|d t|||  W 5 Q R X d S )N'Expected 2D array, got 1D array insteadr'   )r   r*   r+   r,   getattr)r   r   r1   r8   r   r   r   test_X_is_not_1D  s    r   c              	   C   sR   t d}|ddddg}t }d}t jt|d t|| | W 5 Q R X d S )NrF      r   r#   r   r'   )r*   rJ   Seriesr   r+   r,   r   )r   rL   r   r1   r8   r   r   r   test_X_is_not_1D_pandas  s    
r   zX, cat_exp, cat_dtyper   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                 C   s   | | d d d fD ]}t dd}|| t|jts:tt|j|D ]l\}}| }t|d rt|d srt|d d |d d kstn| |kstt	
|j|sFtqFqd S )Nr:   rA   r5   )r   r)   
isinstancecategories_listr   ziptolistr   r   
issubdtyper@   )r   Zcat_exp	cat_dtypeXir7   resr   Zres_listr   r   r   test_one_hot_encoder_categories  s    #

r   zX, X2, cats, cat_dtypedcint64r#   zzobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanzobject-nan-and-Nonec              	   C   s   t |d}tdddgdddgg}t||  | t|jd t|d ksVt|j	d 
 t|d kstt|j	d j|kstt |d}tjtdd || W 5 Q R X t ||d}tdddgdddgg}t||| | d S )Nr5   r   r   r   r&   r'   r6   r    )r   r   r   r
   r   r   r   r6   r   r   r   r@   r*   r+   r,   r)   r-   )r   r0   catsr   r    r7   r   r   r   r   )test_one_hot_encoder_specified_categories  s    :

r   c               	   C   sD  t jddggtdj} tdddggd}t dddgdddgg}t|| |  | t|	|  | |j
d  dddgkstt |j
d jt jstt d	d
ggj} td
d	dggd}d}tjt|d |	|  W 5 Q R X t d	d
t jggj} td	t jd
ggd}tjt|d |	|  W 5 Q R X d S )Nr3   r4   r?   r   r5   r   r   r   r   r   r   z%Unsorted categories are not supportedr'   )r   r   rc   rC   r   r
   r)   r-   r   r   r   r   r   r   r@   object_r*   r+   r,   rz   )r   r7   r   r8   r   r   r   (test_one_hot_encoder_unsorted_categories  s     r   c               	   C   s   t jddgddggtdj} tdddgdddggd}t d	d
d
d	d
d
gd
d	d
d
d
d	gg}t||  | |jd 	 dddgkst
t |jd jt jst
|jd 	 dddgkst
t |jd jt jst
d S )Nr3   r4   r   r   r?   r   r   r5   r   r   )r   r   rc   rC   r   r
   r   r   r   r   r   r   r@   r   r   r7   r   r   r   r   7test_one_hot_encoder_specified_categories_mixed_columns-  s    &r   c                  C   sL   t d} | ddgddgd}t|}t|ddddgddddgg d S )NrF   r3   r4   r   r   rG   r   )r*   rJ   rK   ru   r	   )rL   rM   r}   r   r   r   test_one_hot_encoder_pandas:  s    
r   zdrop, expected_namesx0_cZx2_bZx1_2x0_bZx2_abinaryZmanualc                 C   s>   dddgdddgg}t | d}|| | }t|| d S )Nr   r   r3   r4   r   )r   r)   r_   r
   )r   Zexpected_namesr   r   r`   r   r   r   'test_one_hot_encoder_feature_names_dropC  s
    


r   c                  C   s   ddgddgddgg} t ddddgddddgddddgg}t d dg}td	d
d}|| }t|j| t|| ddgddgddgg} t ddgddgddgg}t dd g}td	d
d}|| }t|j| t|| d S )NrS   yes   norY   r   r   r   r   Fr   truer3   false)r   r   r   r   r
   	drop_idx_r	   )r   expectedZexpected_drop_idxr   resultr   r   r   *test_one_hot_encoder_drop_equals_if_binaryU  s      


r   r   c                 C   sX   t  }tjdddgdddggdd}t|| |d t dd}t|| | d S )Nr   r   r   r?   float64)r   r   r   r
   r   Zastyper   r   r   r   test_ordinal_encoderm  s
    

r   zobject-string-catc              	   C   s   t |d}tdgdgg}t|| | t|jd t|d ksJt|jd 	 t|d ksht|jd j
|ks|tt |d}tjtdd || W 5 Q R X d S )Nr5   r   r   r   r&   r'   )r   r   r   r
   r   r   r6   r   r   r   r@   r*   r+   r,   r)   )r   r0   r   r   r7   r   r   r   r   )test_ordinal_encoder_specified_categories~  s    

r   c               	   C   s   dddgdddgg} t  }|| }tj| td}t||| tddddgddddgg}td}t	j
t|d	 || W 5 Q R X d S )
Nrx   r   rw   rv   r   r?   r   r   r'   )r   r   r   r   rc   r
   r   r   r   r*   r+   r,   )r   r7   r   r   r8   r   r   r   test_ordinal_encoder_inverse  s    

r   c                  C   s   t ddd} tjddgddgdd	ggtd
}tjddgddgddggtd
}| | | |}tjddgddgddggdd
}t|| | |}tjdd gd dgddggtd
}t|| d S )Nuse_encoded_valuer    unknown_valuer3   xr4   yr   r   r?   ZxyZblar   r   r   r   )r   r   r   rc   r)   r-   r
   r   )r7   X_fitr   X_trans_encr   X_trans_invinv_expr   r   r   +test_ordinal_encoder_handle_unknowns_string  s      

 

 r   r@   c                 C   s   t ddd}tjddgddgdd	gg| d
}tjddgddgddgg| d
}|| ||}tjddgddgddggdd
}t|| ||}tjdd gd dgddggtd
}t|| d S )Nr   r   r      r      r   	   r?   rV      r   r   )r   r   r   r)   r-   r
   r   rc   )r@   r7   r   r   r   r   r   r   r   r   r   ,test_ordinal_encoder_handle_unknowns_numeric  s      

 

 r   c                  C   s`   t dtjd} tdgdgdgg}| | | dgdgdgg}t|dgdgtjgg d S )Nr   r   r   r   r   r#   r   )r   r   rz   r   r)   r-   r
   )r7   r   r   r   r   r   (test_ordinal_encoder_handle_unknowns_nan  s
    
r   c               	   C   sN   t dtjtd} tdgdgdgg}tjtdd | | W 5 Q R X d S )Nr   )r    r   r@   r   r   r   z'dtype parameter should be a float dtyper'   )	r   r   rz   intr   r*   r+   r,   r)   )r7   r   r   r   r   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtype  s      r   c               	   C   s\   t jdddddggtdj} dddg}t|d}d}tjt|d ||  W 5 Q R X d S )NZLowZMediumZHighr?   r5   z*Shape mismatch: if categories is an array,r'   )	r   r   rc   rC   r   r*   r+   r,   r)   )r   r   r7   r8   r   r   r   +test_ordinal_encoder_raise_categories_shape  s    

r   c                     s  t ddtjddddgddddggdd} tjddgd	d
ggddtjddgd	d
ggddtddgddggtddgddggtjddgd	dggddfD ]B   t fddtdD stt 	 |  qddgd	d
gg   tfddtdD stt 	 |  ddgd	dgg   tfddtdD sltt 	 |  d S )NrA   r5   r   r   r   r?   r   r   r   r#   r   r3   r4   r   r      a   b   c   drc   c                    s   g | ]}j | j jkqS r   r   r@   .0ir   r7   r   r   
<listcomp>  s     z'test_encoder_dtypes.<locals>.<listcomp>c                    s"   g | ]}t  j| jt jqS r   )r   r   r   r@   integerr   r7   r   r   r     s     c                    s   g | ]} j | jd kqS )rc   r   r   r   r   r   r     s     )
r   r   r   r)   allranger   r
   r-   r   )r   r   r   r   test_encoder_dtypes  s&    
"
 
 
 r   c                     s  t d} tddtjddddddgddddddggdd}| jdd	gd
dgddgddd}| tfddtd	D st	t
| | | dd	gddgddgd}|d j|d j|d jg | t fddtd
D st	t
| | d S )NrF   rA   r5   r   r   r   r?   r   r   r   r#   r   r   )rH   rI   Cr   c                    s   g | ]} j | jd kqS )r   r   r   r   r   r   r     s     z.test_encoder_dtypes_pandas.<locals>.<listcomp>r3   r4   r   r   rH   rI   r   c                    s    g | ]}j | j | kqS r   r   r   ZX_typer7   r   r   r   "  s     )r*   rJ   r   r   r   rK   r)   r   r   r   r
   r-   r   r@   )rL   r   r   r   r   r   test_encoder_dtypes_pandas  s    

"

"r   c                  C   s*   t  } ddgddgg}tj| j| d S )NrO   r   rQ   r   )r   r   testingZassert_no_warningsr   )r7   r   r   r   r   test_one_hot_encoder_warning&  s    r   missing_valuec           	      C   s  dddd| g}t |d}ddddd	gddd
dd	gdddd| gg}|| }d
dd
d
d
gdd
dd
d
gdddddgg}t|| |j|kstdd t|j|jD }|	|}t
j|td}t|d rzt|d d |d d  t|d stt|d stt|d d d df |d d d df  t|dd df |dd df  t|d sftt|d stnt|| t|| d S )Nrv   rV   r   r   r   rx   r   rw   r3   r   r   c                 S   s   g | ]\}}|| qS r   r   )r   ry   rh   r   r   r   r   :  s    z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>r?   r:   )r:   r:   )r   r   r   r
   r   r   r   r   r   r   r   r   rc   r   )	r   Zcats_to_dropr7   r   Ztransr   Zdropped_catsZX_inv_transZX_arrayr   r   r    test_one_hot_encoder_drop_manual,  s2    
(

*"
r   rR   c              	   C   sN   t | d}d}tjt|d( |dddgdddgdd	d
gg W 5 Q R X d S )Nr   z-`drop` should have length equal to the numberr'   rx   r   rw   rv   r   r   ;   )r   r*   r+   r,   r)   )r   r7   rn   r   r   r   test_invalid_drop_lengthQ  s    
r   densityr   Zdensec                 C   s   t | d}t | |d}dddgdddgg}|| || t|j|j |dkrbt|jd	 n0t||j|jD ]\}}}|t| |ksrtqrt|jt	j
st|jjtkstd S )
Nr   r   r   r   r3   r   r4   r   r   )r   r)   r
   r   r   r   r   r   r   r   Zndarrayr@   rc   )r   r   Zohe_baseZohe_testr   Zdrop_catZdrop_idxZcat_listr   r   r   test_categoriesY  s     


  r   Encoderc                 C   s   d|    d kstd S )NcategoricalZX_types)Z	_get_tagsr   )r   r   r   r   "test_encoders_has_categorical_tagsm  s    r   kwargsmax_categoriesmin_frequency   g(\?r   )r   r   rV   r6   rA   c           
      C   s   t dgd dgd  dgd  dgd  gj}tf |d	d
d| |}t|jdddgg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dd dgdgd  D }|	|}t|| |
 }	tddg|	 dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.r3   r   r4   r   r   rS   r   r   r"   F)r6   r    r   er   r   c                 S   s   g | ]
}|gqS r   r   r   colr   r   r   r     s     z2test_ohe_infrequent_two_levels.<locals>.<listcomp>infrequent_sklearnr#   r   x0_infrequent_sklearnNr   r   rC   r   r)   r
   infrequent_categories_r-   r	   r   r_   )
r   r6   X_trainr   X_testr   r   expected_invX_invr`   r   r   r   test_ohe_infrequent_two_levelsr  s(    2(



r  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}|jd |jd  dksdtt dgdgg}||}t	dgdgg| |
 }tdg| ||}tdgdgg| dS )z3Test two levels and dropping the frequent category.r3   r   r4   r   r   rS   r   r   r"   Fr   r    r   r   r   r   r   r  r   N)r   r   rC   r   r)   r   r   r   r-   r	   r_   r
   r   )r   r  r   r  r   r`   	X_inverser   r   r   ,test_ohe_infrequent_two_levels_drop_frequent  s"    2

r  c              	   C   sz   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W 5 Q R X dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.r3   r   r4   r   r   rS   r   r   r"   Fr   r	  Unable to drop category r   ( from feature 0 because it is infrequentr'   Nr   r   rC   r   r*   r+   r,   r)   r   r  r   r8   r   r   r   5test_ohe_infrequent_two_levels_drop_infrequent_errors  s    2r  r   gQ?g{Gz?r   c           	   	   C   s  t dgd dgd  dgd  dgd  gj}tf d	d
d| |}t|jddgg dgdgdgdgdgg}t dddgdddgdddgdddgdddgg}||}t|| dgdgdgdgdgg}|	|}t|| |
 }tdddg| dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.r3   r   r4   r   r   rS   r   r   r"   Fr    r   r   r   r   r   r   r   r  Nr  )	r   r  r   r  r   r   r  r  r`   r   r   r    test_ohe_infrequent_three_levels  s0    2 2



r  c              	   C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t dgdgdgg}tddgddgddgg|| |jdd| d}tj	t
|d |dgdgg}W 5 Q R X tddgddgg| dS )z5Test three levels and dropping the frequent category.r3   r   r4   r   r   rS   r   r   r"   Fr	  r   r   r!   r%   r&   r'   r   N)r   r   rC   r   r)   r	   r-   rp   r*   warnsUserWarning)r   r  r   r  r8   r   r   r   r   .test_ohe_infrequent_three_levels_drop_frequent  s     2"r  c              	   C   sz   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W 5 Q R X dS )z7Test three levels and dropping the infrequent category.r3   r   r4   r   r   rS   r   r   r"   Fr	  r  r   r  r'   Nr  r  r   r   r   7test_ohe_infrequent_three_levels_drop_infrequent_errors  s    2r  c               	   C   s   t dgd dgd  dgd  dgd  gj} td	d
dd| }t|jddgg dgdgdgdgg}t dddgdddgdddgdddgg}||}t|| dgg}d}t	j
t|d || W 5 Q R X dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.r3   r   r4   r   r   rS   r   r   r$   F)r    r   r   r   r   badz.Found unknown categories \['bad'\] in column 0r'   N)r   r   rC   r   r)   r
   r  r-   r	   r*   r+   r,   )r  r   r  r   r   r8   r   r   r   (test_ohe_infrequent_handle_unknown_error  s"    2  *

r  c                 C   s   t jdgd dgd  gtdj}tf ddddggd	d
d| |}dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dddgg}dgdgg}|D ].}|j|d| tdgdgg|| qdS )zG'a' is the only frequent category, all other categories are infrequent.r3   r   r   rY   r?   r   r   r4   Fr"   r6   r   r    r   r   r   r   r   N)	r   r   rc   rC   r   r)   r-   r	   rp   )r   r  r   r  r   r   Zdropsr   r   r   r   5test_ohe_infrequent_two_levels_user_cats_one_frequent,  s&    "(

r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tddddggd
ddd| }t|jdddgg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t	|| dd dgdgd  D }|
|}t|| dS )zFTest that the order of the categories provided by a user is respected.r3   r   r4   r   r   rS   r   r   r?   Fr"   r   r6   r   r    r   r   r   r   c                 S   s   g | ]
}|gqS r   r   r   r   r   r   r   ^  s     z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>r   r#   Nr   r   rc   rC   r   r)   r
   r  r-   r	   r   r  r   r  r   r   r  r  r   r   r   (test_ohe_infrequent_two_levels_user_catsH  s(    ( (


r  c               	   C   s   t jdgd dgd  dgd  dgd  gtd	j} tddddggd
ddd| }t|jddgg dgdgdgdgdgg}t dddgdddgdddgdddgdddgg}||}t	|| dgdgdgdgdgg}|
|}t|| dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.r3   r   r4   r   r   rS   r   r   r?   Fr"   r  r   r   r   r   Nr  r  r   r   r   *test_ohe_infrequent_three_levels_user_catsc  s2    ( 2


r  c                  C   s   t jdddddddddg	dddddddddg	f } tdddd}||  ddgddgg}||}t|ddddgddddgg dS )	zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.r   r   r   r   r   F)r   r   r   N)r   c_r   r)   r-   r	   )r   r   r  r   r   r   r   test_ohe_infrequent_mixed  s    2

r!  c            	      C   s  t jdddddddddg	dddddddddg	dddddddddg	f } tdddd	}||  }t|jd ddg t|jd ddg t|jd d
 | }tddddddddg| ddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgg	}t|| dddgdddgg}|	|}ddddddddgddddddddgg}t||  |
|}t jddd
gddd
ggtd}t|| tdddd	| }tjtdd |	| W 5 Q R X dddgdddgg}|	|}ddddddddgddddddddgg}t||  |
|}t jdddgdddggtd}t|| d
S )z?Test infrequent categories with feature matrix with 3 features.r   r   r   r   r   rS   rA   r"   r6   r   r    NZx0_0Zx0_3r  Zx1_0Zx1_5Zx1_infrequent_sklearnZx2_0Zx2_1r#   r   r?   r$   r&   r'   )r   r   r   r   r   r
   r  r_   r	   r-   r   r   rc   r)   r*   r+   r,   )	r   r   r   r`   r   r  X_test_transr  r  r   r   r   'test_ohe_infrequent_multiple_categories  s      


(
 
  
(
r$  c            	      C   s(  t d} | jdddddddddg	dddd	d	d
dddg	dddgd}tdddd}|| }t|jd ddg t|jd ddd
g ddddddgddddddgddddddgddddddgddddddgddddddgddddddgddddddgddddddgg	}t|| | jddgdd
gdddgd}ddddddgddddddgg}|	|}t||  |
|}tjddgddggtd}t|| | jddgd
dgdddgd}|	| }ddddddgddddddgg}t|| |
|}tjddgddggtd}t|| dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rF   r3   fr   r4   r   r   r   rS   rV   )strr   r&  r   columnsrA   r"   r"  r      r   r?   N)r*   rJ   rK   r   r   r   r
   r  r	   r-   r   r   r   rc   )	rL   r   r   r   r   r  r#  r  r  r   r   r   .test_ohe_infrequent_multiple_categories_dtypes  s\    
  	
  


  

 r*  rX   )r   r   c                 C   sp   t dgd dgd  dgd  dgd  gj}tf d	d
d| }|| |dgg}t|dgg dS ),All user provided categories are infrequent.r3   r   r4   r   r   rS   r   r   r"   Fr  r   N)r   r   rC   r   r)   r-   r	   r   r  r   r   r   r   r   $test_ohe_infrequent_one_level_errors5  s    2 
r-  c                 C   sf   t jdgd gtdj}tf ddddggdd	d
| |}|dgdgg}t|dgdgg dS )r+  r   r   r?   r   r   r3   r4   Fr"   r  r   N)r   r   rc   rC   r   r)   r-   r	   r,  r   r   r   5test_ohe_infrequent_user_cats_unknown_training_errorsC  s    r.  c               	   C   sH   ddgddgddgg} d}t jt|d tdd	|  W 5 Q R X d S )
NrO   r   rQ   r   r   z'`sparse` was renamed to `sparse_output`r'   Fr   )r*   r  FutureWarningr   r)   )r   r8   r   r   r   &test_one_hot_encoder_sparse_deprecatedT  s    r0  zinput_dtype, category_dtypeZOOZOUZUOZUUSOZSUZSS
array_typer   r   Z	dataframec           
      C   s   t jdgdgg| d}t jddg|dg}t|dd|}tdgdgdgdgg|| d}||}t ddgddgddgddgg}t|| t|d|}	|	|}t dgdgdgdgg}t|| d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    r4   r3   r?   Frs   r   r   r5   N)	r   r   r   r)   r   r-   r	   r   r
   )
r>   Zcategory_dtyper2  r   r6   r   r  r   r   oer   r   r   test_encoders_string_categories]  s      
"

r4  c               	   C   sh   t jdgdggdd} t jddgddg}t|dd}td}tjt|d	 ||  W 5 Q R X d
S )zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    r4   r3   Ur?   SFrs   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r'   N)	r   r   r   r   r   r*   r+   r,   r)   )r   r6   r   r8   r   r   r   $test_mixed_string_bytes_categoricals|  s    r7  c                 C   sP   t jdd| d| ggtdj}tddd|}| }t|ddd	|  g d S )
Nr3   r4   r?   Fr!   r   r    Zx0_ar   Zx0_)r   r   rc   rC   r   r)   r_   r
   )r   r   r   namesr   r   r   )test_ohe_missing_values_get_feature_names  s    r:  c                  C   s   t d} | jddd dgtjdddtjgtddd	d
gd}tdddddddgdddddddgdddddddgdddddddgg}t|}t|| d S )NrF   dogry   r   r   r#   r?   )col1col2r<  r=  r'  r   )	r*   rJ   rK   r   r   rz   floatru   r	   )rL   dfexpected_df_transr}   r   r   r   %test_ohe_missing_value_support_pandas  s     

	rA  pd_nan_typepd.NAznp.nanc              
   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}td	d	d
d	gd
d	d	d	gd	d	d	d
gd	d
d	d	gd
d	d	d	gg}td|d}|	|}t
|| t|jd
kstt|jd	 d d dddg t|jd	 d std S )NrF   rC  r<  r   r3   r4   ri   r?   r   r   Fr8  r:   )r*   rJ   NAr   rz   rK   r   r   r   r   r	   lenr   r   r
   isnan)rB  r    rL   pd_missing_valuer?  r@  r   df_transr   r   r   1test_ohe_missing_value_support_pandas_categorical  s*    
 







rI  c              	   C   s   ddgddgddgg}t dd| d}||}tdddgdddgdddgg}t|| d	d
gg}tdddgg}d}tjt|d ||}W 5 Q R X t|| |	|}t
|tjddggtd dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.r3   r   r4   r   r   r   Fr   r   r    r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr'   r?   Nr   r   r   r   r	   r*   r  r  r-   r   r
   rc   r    r   r   r   rD   r  warn_msgr  r   r   r   /test_ohe_drop_first_handle_unknown_ignore_warns  s.      




rO  c              	   C   s   ddgddgddgg}t dd| d}||}tddddgddddgddddgg}t|| d	d
gg}tddddgg}d}tjt|d ||}W 5 Q R X t|| |	|}t
|tjddggtd dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.r3   r   r4   r   r   r   FrJ  r   r   rK  r'   Nr?   rL  rM  r   r   r   3test_ohe_drop_if_binary_handle_unknown_ignore_warns  s.      







rP  c              	   C   s   ddgddgddgg}t dd| ddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W 5 Q R X t|| dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.r3   r   r4   r   r   r   F)r   r   r    r6   r   zqFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr'   N)	r   r)   r   r   r*   r  r  r-   r	   )r    r   r   r  rD   rN  r   r   r   r   'test_ohe_drop_first_explicit_categories  s    

rQ  c               	   C   s   t d} | jdddgdddgdddgd	}td
d}|jdd d}t jt|d || W 5 Q R X || t jt|d |	| W 5 Q R X dS )zJRaise informative error message when pandas output and sparse_output=True.rF   r   r   r   r   r4   )r3   r4   r3   r'  Tr   r-   zxPandas output does not support sparse data. Set sparse_output=False to output pandas DataFrames or disable pandas outputr'   N)
r*   rJ   rK   r   
set_outputr+   r,   r   r)   r-   )rL   r?  r   r8   r   r   r   'test_ohe_more_informative_error_message7  s    
$

rT  c               	   C   sX   t t jdddggj} tt jd}dt j }tjt|d |	|  W 5 Q R X dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   r?   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r'   N)
r   r   rz   rC   r   int32r*   r+   r,   r)   )r   r3  r8   r   r   r   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtypeK  s    
rV  encoded_missing_valuer   c                 C   s   t jt jdddggt jdj}t| d|}t|jdks@t	t
|jd ddt jg ||}t
|| gdgdgdgg ||}t
|| dS )	z.Test ordinal encoder with nan on float dtypes.r   r   r?   rW  r   r   r   N)r   r   rz   r   rC   r   r)   rE  r   r   r	   r-   r   )rW  r   r3  r   r
  r   r   r   5test_ordinal_encoder_passthrough_missing_values_floatY  s    

rY  c              	   C   s$  t d}| dkr|jntj}|d|jdd|ddgddi}t|d	|}t	|j
d
ksbtt|j
d dd dddg t|j
d d st||}t|dgdg|gdgdgg ||}|jdkstt|dddf ddg t|dddf ddg t|d s tdS )z0Check ordinal encoder is compatible with pandas.rF   rC  r<  r   r3   r4   ri   r?   rX  r   r   Nr   r:          @r   r   )r   r   r   r   )r*   rJ   rD  r   rz   rK   r   r   r)   rE  r   r   r
   rF  r-   r	   r   r   )rB  rW  rL   rG  r?  r3  rH  r
  r   r   r   =test_ordinal_encoder_missing_value_support_pandas_categoricalk  s$    
 

r[  rZ  zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec              	   C   st   t |d}tdgtjgg}t|| | |jd j|ksBtt |d}t	j
tdd || W 5 Q R X dS )z.Test ordinal encoder for specified categories.r5   r   r   r&   r'   N)r   r   r   rz   r
   r   r   r@   r   r*   r+   r,   r)   )r   r0   r   r   r3  r   r   r   r   =test_ordinal_encoder_specified_categories_missing_passthrough  s    &

r\  zX, expected_X_trans, X_testr   r   c                 C   s8   t ddd}|| }t|| t||dgg dS )z>Test the interaction between missing values and handle_unknownr   r:   r   g      N)r   r   r	   r-   )r   Zexpected_X_transr  r3  r   r   r   r   /test_ordinal_encoder_handle_missing_and_unknown  s    

r]  c               	   C   s   t dddgdddgg} t| }t }d}tjt|d || W 5 Q R X tjt|d |	| W 5 Q R X |	| }t|}tjt|d |
| W 5 Q R X dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   r   r   z6A sparse matrix was passed, but dense data is requiredr'   N)r   r   r   Z
csr_matrixr   r*   r+   rm   r)   r   r   )r   ZX_sparseencoderrn   r   r   r   r   r   test_ordinal_encoder_sparse  s    


r_  c               	   C   s   t ddddddgddt jf } tdddggddd	}||  tdddggd
d}tjtdd ||  W 5 Q R X dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r   r   r   r   Nr:   r   r   )r6   r    r   r$   r   r&   r'   )r   r   Znewaxisr   r)   r*   r+   r,   )r   r3  r   r   r   -test_ordinal_encoder_fit_with_unseen_category  s    $
  
r`  r  ZAAOr5  r  c                 C   s4   t ddd}||  ||}t|ddgg dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r   ir   r   N)r   r)   r-   r	   )r  r  r7   r   r   r   r   1test_ordinal_encoder_handle_unknown_string_dtypes  s    

rb  c                  C   sf   t ddddgdd} t | }t|jt j| ddj |	| }t|dgd	gd
gdgg dS )zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6kr:   r   r   )Zaxisr   r   N)
r   r   r;   r   r)   r
   r   sortrC   r-   )r   r^  r   r   r   r   #test_ordinal_encoder_python_integer'  s     
rd  c                  C   sL   t d} dddg}| jdddgg|d}t |}| }t|| d	S )
z-Check feature names out is same as the input.rF   r4   r   r3   r   r   r   r'  N)r*   rJ   rK   r   r)   r_   r
   )rL   r9  r   r7   Zfeature_names_outr   r   r   .test_ordinal_encoder_features_names_out_pandas;  s    

re  c                  C   s   t jdgdgt jggtd} tdt jdd| }|| }t|dgdgdgg t jd	gt jggtd}||}t|t jgdgg ||}|d d d
kst	t 
|d d st	d
S )zECheck interactions between encode_unknown and missing value encoding.r3   r4   r?   r   r    r   rW  r   r   r   N)r   r   rz   rc   r   r)   r-   r	   r   r   rF  )r   r3  r   r  r#  X_roundtripr   r   r   0test_ordinal_encoder_unknown_missing_interactionG  s     


ri  with_pandasc              	   C   s   t jddgddgdt jggtd}d}| rPtd}|j|d	d
gd}|d }n|d }tdd}tjt	|d |
| W 5 Q R X dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.r3   r;  r4   ry   r   r?   zTencoded_missing_value \(1\) is already used to encode a known category in features: rF   letterZpetr'  z	\['pet'\]z\[1\]r   rX  r'   N)r   r   rz   rc   r*   rJ   rK   r   r+   r,   r)   )rj  r   	error_msgrL   r3  r   r   r   0test_ordinal_encoder_encoded_missing_value_errore  s    "


rm  z4X_train, X_test_trans_expected, X_roundtrip_expected1c                 C   s   t dtjtjd| }tdgtjgdgg}||}t|| ||}|jd }t	|D ]V}||df }	||df }
|	dkr|
dkst
q`t|	rt|
st
q`|
|	ks`t
q`dS )znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r   rg  rn  r4   r   N)r   r   rz   r)   r   r-   r	   r   r   r   r   r   rF  )r  ZX_test_trans_expectedZX_roundtrip_expectedr3  r  r#  rh  Z	n_samplesr   Zexpected_valvalr   r   r   9test_ordinal_encoder_unknown_missing_interaction_both_nan  s(    



rp  c               	   C   s   t d} | ddgddgd}t }|jdd d}t jt|d	 || W 5 Q R X td
djdd}td
djdd}||}||}t|	 | t
| |j dS )z*Check OneHotEncoder works with set_output.rF   r3   r4   r   r   rG   rR  z*Pandas output does not support sparse datar'   Fr   defaultN)r*   rJ   rK   r   rS  r+   r,   r   r	   to_numpyr
   r_   r(  )rL   rM   r   r(   Zohe_defaultZ
ohe_pandas	X_defaultX_pandasr   r   r   test_one_hot_encoder_set_output  s    


ru  c                  C   st   t d} | ddgddgd}t jdd}t jdd}||}||}t| | t|	 |j
 d	S )
z+Check OrdinalEncoder works with set_output.rF   r3   r4   r   r   rG   rq  rR  N)r*   rJ   rK   r   rS  r   r	   rr  r
   r_   r(  )rL   rM   Zord_defaultZ
ord_pandasrs  rt  r   r   r   test_ordinal_set_output  s    


rv  c                  C   sz   dddddgddgg} t | d}|ddgg t| t|jksFtt|jD ]$\}}|jtksftt| | | qPd	S )
zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    asZmmasZeasZrasZacsrn  2r5   N)	r   r)   rE  r   r   	enumerater@   rc   r
   )r6   r7   nry   r   r   r    test_predefined_categories_dtype  s    
r{  c                  C   s~   t jdgdgt jggtd} tdd| }t|dgdgdgg tddd	| }t d
gg}||}t|dgg dS )zBCheck missing value or unknown encoding can equal the cardinality.r;  ry   r?   r   rX  r   r   r   r   snakeN)	r   r   rz   rc   r   r   r	   r)   r-   )r   r   r7   r  r   r   r   1test_ordinal_encoder_missing_unknown_encoding_max  s    
r}  c                  C   s  t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	d
d| }t| ddddg |jd |j	d  dkst
t jdgd dgd  dgd  gtdj} tdd	dd| }t| dg |jd |j	d  dkst
t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	dgd| }t| ddddg |jd |j	d  dks|t
tdd	dd| }t| dddddg |j	dkst
dS )zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    r3   r   r4   r#   r   r   r   r?   Fr   )r   r   r   r   Zx0_dZx0_er  r   rS   r   r   N)r   r   rc   rC   r   r)   r
   r_   r   r   r   )r   r   r   r   r   #test_drop_idx_infrequent_categories  s<    2  
,2  
r~  c                 C   s   t dgd dgd  dgd  dgd  gj}tf d	d
d| |}t|jddddgg t|jddgg dgdgdgdgdgg}dgdgdgdgd
gg}||}t	|| |
|}dgdgdgdgdgg}t|| dS )zGTest parameters for grouping 'a', and 'd' into the infrequent category.r3   r   r4   r   r   rS   r   r   r   r:   r   r   r   r   r   r   N)r   r   rC   r   r)   r
   r   r  r-   r	   r   )r   r  ordinalr  expected_transr   r
  expected_inverser   r   r   ,test_ordinal_encoder_infrequent_three_levels  s.    2 


r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tddddggdd
dd| }t|jddddgg t|jddgg dgdgdgdgdgg}dgdgdgdgdgg}|	|}t
|| ||}dgdgdgdgdgg}t|| dS )zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    r3   r   r4   r   r   rS   r   r   r?   r   r:   )r6   r   r    r   r   r   r   r   r   N)r   r   rc   rC   r   r)   r
   r   r  r-   r	   r   )r  r  r  r  r   r
  r  r   r   r   6test_ordinal_encoder_infrequent_three_levels_user_catsB  s4    ( 


r  c                  C   s   t dddddddddg	dddddddddg	f} tdd| }t|jd ddg |jd dkshtddgddgg}ddgddgg}||}t|| |	|}t j
ddgddggtd}t|| dS )	zETest when feature 0 has infrequent categories and feature 1 does not.r   r   r   r   r   Nr   r?   )r   Zcolumn_stackr   r)   r
   r  r   r-   r	   r   r   rc   )r   r  r  r  r   r
  r  r   r   r   %test_ordinal_encoder_infrequent_mixedf  s    2


r  c                  C   sV  t d} | ddddg}| jdddddddd	d	g	d
dddddddd
g	| jdgd dgd  dg dg |dddddgd}tdd|}t|jd dd	g t|jd dddg t|jd ddg | jdd	ddgdddd
g| jdgdg dg dg |dddddgd}dddgdddgdddgdddgg}|	|}t
|| dS )zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rF   birdry   r;  r|  r3   r%  r   r4   r   r   r   rS   rV   r#   r?   )r&  r   r   r&  r   r   r'  r  r   r   N)r*   rJ   ZCategoricalDtyperK   r   r   r)   r
   r  r-   r	   )rL   Zcategorical_dtyper   r  r  r  r   r   r   r   :test_ordinal_encoder_infrequent_multiple_categories_dtypes{  s:    



$
r  c                  C   s   t jdgd dgd  dgd  dgd  t jg gtd	j} td
dddd| }t|jdddgg t jdgdgdgdgdgt jggtd	}dgdgdgdgdgdgg}|	|}t
|| dS )zJCheck behavior of unknown_value and encoded_missing_value with infrequent.r3   r   r4   r   r   rS   r   r   r?   r   r   )r    r   r   rW  r   r   r   N)r   r   rz   rc   rC   r   r)   r
   r  r-   r	   )r  r  r  r  r   r   r   r   .test_ordinal_encoder_infrequent_custom_mapping  s"    0 (
r  c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tf | d
dd|}td
dd|}dgdgdgdgdgg}t|||| dS )zMAll categories are considered frequent have same encoding as default encoder.r3   r   r4   r   r   rS   r   r   r?   r   r:   r   r   Nr   r   rc   rC   r   r)   r	   r-   )r   r  Zadjusted_encoderZdefault_encoderr  r   r   r   !test_ordinal_encoder_all_frequent  s,    	(   
  r  d   c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tf | d
dd|}dgdgdgdgdgg}t||dgdgdgdgdgg dS )zAWhen all categories are infrequent, they are all encoded as zero.r3   r   r4   r   r   rS   r   r   r?   r   r:   r   r   r   Nr  )r   r  r^  r  r   r   r   #test_ordinal_encoder_all_infrequent  s    	(   
r  c                  C   s   t jt jgd dgd  dgd  dg dg gtdj} td	d
| }t jdddt jggtdj}||}t|dgdgdgt jgg dS )z5Check behavior when missing value appears frequently.r   r;  rS   ry   r   r|  deerr?   r   r  r   r   r   N	r   r   rz   rc   rC   r   r)   r-   r	   r   r  r  r   r   r   r   -test_ordinal_encoder_missing_appears_frequent  s    ,
r  c                  C   s   t jt jgdgd  dgd  dg dg dgd d	gd  gtd
j} tdd| }t jddgdd	gt jd	gdd	gddggtd
}||}t|ddgddgt jdgddgddgg dS )z7Check behavior when missing value appears infrequently.r;  rS   ry   r   r|  r  redr   greenr?   r#   )r   r   r   r   Nr  r  r   r   r   /test_ordinal_encoder_missing_appears_infrequent  s$    &

r  )}r   numpyr   r*   Zscipyr   Zsklearn.exceptionsr   Zsklearn.preprocessingr   r   Zsklearn.utilsr   Zsklearn.utils._testingr   r	   r
   r   markZparametrizer2   r9   r<   rU  Zfloat32r   rE   rN   ra   rd   ro   rr   ru   r   rc   rz   r>  r~   r   r   r   r   r   r   r   r   Zstr_r   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r!  r$  r*  r-  r.  r0  r4  r7  r:  rA  rI  rO  rP  rQ  rT  rV  rY  r[  r\  r]  r_  r`  rb  rd  re  ri  rm  rB   rp  ru  rv  r{  r}  r~  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   s  


<
  
/0&


 &&* !





-8	
	
		






$







 
$[A

	 

%
$

		"

!$$0