U
    -e                     @   s   d dl Zdd ZdddZdS )    Nc                 C   s   ||  |    }t|}t||   }|dkr|| }tt|ddd }|D ]V}t||k\}	tt|	|}
|j	|	|
dd}	||	  d7  < ||
8 }|dkrX qqX|
tjS )a  Computes approximate mode of multivariate hypergeometric.
    This is an approximation to the mode of the multivariate
    hypergeometric given by class_counts and n_draws.
    It shouldn't be off by more than one.
    It is the mostly likely outcome of drawing n_draws many
    samples from the population given by class_counts.
    Args
    ----------
    class_counts : ndarray of int
        Population per class.
    n_draws : int
        Number of draws (samples to draw) from the overall population.
    rng : random state
        Used to break ties.
    Returns
    -------
    sampled_classes : ndarray of int
        Number of samples drawn from each class.
        np.sum(sampled_classes) == n_draws

    r   NF)sizereplace   )sumnpfloorintsortuniquewhereminlenchoiceZastypeZint64)class_countsZn_drawsrngZ
continuousZflooredZneed_to_add	remaindervaluesvalueZindsZadd_now r   X/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/utils/stratify.pyapproximate_mode   s    
r   
   c              	   c   sN  t j| dd\}}|jd }t |}t |dk r<td||k rTtd||f ||k rltd||f t t j|dd	t |d
d }	t	|D ]}
t
|||}|| }t
|||}g }g }t	|D ]\}||| }|	| j|dd}||d
||   |||| || ||    q||}||}||fV  qd
S )a  

    Provides train/test indices to split data in train/test sets.
    It's reference is taken from StratifiedShuffleSplit implementation
    of scikit-learn library.

    Args
    ----------

    n_train : int,
        represents the absolute number of train samples.

    n_test : int,
        represents the absolute number of test samples.

    random_state : int or RandomState instance, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.

    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.
    T)Zreturn_inverser      zMinimum class count errorzLThe train_size = %d should be greater or equal to the number of classes = %dzKThe test_size = %d should be greater or equal to the number of classes = %dZ	mergesort)kindNr   Zclip)mode)r   r   shapeZbincountr   
ValueErrorsplitZargsortZcumsumranger   permutationZtakeextend)yZn_trainZn_testr   Zn_splitsclassesZ	y_indicesZ	n_classesr   Zclass_indices_Zn_iZclass_counts_remainingZt_itraintestir    Zperm_indices_class_ir   r   r   )stratified_shuffle_split_generate_indices6   s8    



$$

r(   )r   )numpyr   r   r(   r   r   r   r   <module>   s   2