U
    -eF                  	   @   s  d Z ddlZddlZddlmZmZ ddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlm Z m!Z! dZ"eddd\Z#Z$ee#e$dd\Z#Z$e %e#Z#ddddgZ&dhdd e' D B Z(ej)*dedd Z+dd  Z,ej)*d!ej-ej.gd"d# Z/d$d% Z0ej)*d&e&ej)*d'ed(d) Z1d*d+ Z2ej)*d,d-d.d/ Z3d0d1 Z4d2d3 Z5d4d5 Z6d6d7 Z7d8d9 Z8ej)*d:d;d<gd=d> Z9d?d@ Z:ej)*dAe&dBdC Z;dDdE Z<dFdG Z=ej)*dHd'dIie>dJej?gej?dJggfd'dIidJdKgdKdJggfi dJdKgddLggfgdMdN Z@dOdP ZAdQdR ZBdSdT ZCdUdV ZDej)*dWdXdYgej)*dZdd[gd\d] ZEd^d_ ZFdS )`zF
Tests for HDBSCAN clustering algorithm
Based on the DBSCAN test code
    N)sparsestats)distance)HDBSCAN)CONDENSED_dtype_condense_tree_do_labelling)_OUTLIER_ENCODING)
make_blobs)fowlkes_mallows_score)_VALID_METRICSeuclidean_distances)BallTreeKDTree)StandardScaler)shuffle)assert_allcloseassert_array_equal      
   )	n_samplesrandom_state   )r   kdtreeballtreebruteautoc                 C   s   h | ]\}}|d  qS )label ).0_outr    r    c/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/sklearn/cluster/tests/test_hdbscan.py	<setcomp>%   s     r%   outlier_typec                 C   s   t jt jd|  }dd dd d|  }t|  d }t|  d }t }|dg|d< ||g|d	< t |}|j|k	 \}t
|dd	g ||j|	 \}t
|dd	g ttdd	ttd
d }	t ||	 }
t
|
j|j|	  dS )O
    Tests if np.inf and np.nan data are each treated as special outliers.
    )infinitemissingc                 S   s   | |kS Nr    xyr    r    r$   <lambda>2       z#test_outlier_data.<locals>.<lambda>c                 S   s
   t | S r*   )npisnanr+   r    r    r$   r.   3   r/   r   prob   r         r   N)r0   infnanr	   Xcopyr   fitlabels_Znonzeror   Zprobabilities_listrange)r&   ZoutlierZ
prob_checkr   r2   	X_outliermodelmissing_labels_idxZmissing_probs_idxZclean_indicesclean_modelr    r    r$   test_outlier_data(   s.    rB   c               	   C   s   t t} |  }tddd| }t| | tt|t }|t	ksHt
tt|}|dks^t
d}tjt|d tdddt W 5 Q R X d}d| d	< d
| d< tjt|d tdd|  W 5 Q R X dS )zy
    Tests that HDBSCAN works with precomputed distance matrices, and throws the
    appropriate errors when needed.
    precomputedT)metricr9   \(\?z*The precomputed distance matrix.*has shapematchz'The precomputed distance matrix.*valuesr   )r   r3   r3   )r3   r   rD   N)r   r8   r9   r   fit_predictr   lensetOUTLIER_SETn_clusters_trueAssertionErrorr   r-   pytestraises
ValueError)DZ
D_originallabels
n_clustersscoremsgr    r    r$   test_hdbscan_distance_matrixH   s     

rW   sparse_constructorc                 C   sz   t t t}|t| }t| d}d|||k< | |}|	  t
dd|}tt|t }|tksvtdS )zA
    Tests that HDBSCAN works with sparse distance matrices.
    2           rC   rH   N)r   Z
squareformZpdistr8   r0   maxr   ZscoreatpercentileflattenZeliminate_zerosr   rI   rJ   rK   rL   rM   rN   )rX   rR   	thresholdrS   rT   r    r    r$   #test_hdbscan_sparse_distance_matrixf   s    r^   c                  C   sB   t  t} tt| t }|tks(ttt	| }|dks>tdS )z
    Tests that HDBSCAN works with feature array, including an arbitrary
    goodness of fit check. Note that the check is a simple heuristic.
    rE   N)
r   rI   r8   rJ   rK   rL   rM   rN   r   r-   )rS   rT   rU   r    r    r$   test_hdbscan_feature_arrayy   s
    
r_   algorD   c              	   C   s  t | dt}tt|t }|tks,t| dkr8dS tt	d}dt
tjd idt
tjd iddidt
tjd d	d
|d}t | ||d}|||  jkrtt |t W 5 Q R X n4|dkrtt |t W 5 Q R X n
|t dS )z
    Tests that HDBSCAN works with the expected combinations of algorithms and
    metrics, or raises the expected errors.
    )	algorithm)r   r   N)r   r   Vr3   p   )rc   w)Zmahalanobis
seuclideanZ	minkowski
wminkowskira   rD   metric_paramsrg   )r   rI   r8   rJ   rK   rL   rM   rN   r   r   r0   eyeshapeonesgetvalid_metricsrO   rP   rQ   r:   ZwarnsFutureWarning)r`   rD   rS   rT   ZALGOS_TREESri   hdbr    r    r$   test_hdbscan_algorithms   s8     rq   c                  C   s6   t  t} | d}tt|t }|tks2tdS )z
    Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering.
    This test is more of a sanity check than a rigorous evaluation.

    TODO: Improve and strengthen this test if at all possible.
    333333?N)	r   r:   r8   dbscan_clusteringrJ   rK   rL   rM   rN   )Z	clustererrS   rT   r    r    r$   test_dbscan_clustering   s    
rt   cut_distance)皙?      ?r3   c                 C   s   t d d }t d d }t }tjdg|d< dtjg|d< tjtjg|d< t |}|j| d}t	||k}t
|ddg t	||k}t
|dg tttd	t||  }t || }	|	j| d}
t
|
||  d
S )r'   r)   r   r(   r3   r   rd   r4   )ru   r   N)r	   r8   r9   r0   r6   r7   r   r:   rs   Zflatnonzeror   r<   rK   r=   )ru   Zmissing_labelZinfinite_labelr>   r?   rS   r@   Zinfinite_labels_idxZ	clean_idxrA   Zclean_labelsr    r    r$   #test_dbscan_clustering_outlier_data   s     rx   c                  C   sb   t dddd\} }t | } tdddt| jd id	| }tt	|t
 }|tks^td
S )zQ
    Tests that HDBSCAN using `BallTree` works with higher-dimensional data.
    rY   r   @   )r   r   Z
n_featuresr   rf   rb   r3   rh   N)r
   r   fit_transformr   r0   rl   rk   rI   rJ   rK   rL   rM   rN   )Hr-   rS   rT   r    r    r$   test_hdbscan_high_dimensional   s    r|   c                  C   sB   t ddttjd idt} tt| t }|t	ks>t
dS )z4
    Tests that HDBSCAN using `BallTree` works.
    rf   rb   r3   )rD   ri   N)r   r0   rl   r8   rk   rI   rJ   rK   rL   rM   rN   rS   rT   r    r    r$   !test_hdbscan_best_balltree_metric   s     r~   c                  C   s8   t ttd dt} tt| t }|dks4tdS )z
    Tests that HDBSCAN correctly does not generate a valid cluster when the
    `min_cluster_size` is too large for the data.
    r3   min_cluster_sizer   N)r   rJ   r8   rI   rK   rL   rN   r}   r    r    r$   test_hdbscan_no_clusters   s    r   c                  C   s\   t dttdD ]F} t| dt}dd |D }t|dkrtt|| kstqdS )zb
    Test that the smallest non-noise cluster has at least `min_cluster_size`
    many points
    rd   r3   r   c                 S   s   g | ]}|d kr|qS )r   r    )r!   r   r    r    r$   
<listcomp>  s      z1test_hdbscan_min_cluster_size.<locals>.<listcomp>r   N)	r=   rJ   r8   r   rI   r0   minZbincountrN   )r   rS   Ztrue_labelsr    r    r$   test_hdbscan_min_cluster_size   s
    r   c                  C   s6   t j} t| dt}tt|t }|tks2t	dS )zA
    Tests that HDBSCAN works when passed a callable metric.
    rH   N)
r   	euclideanr   rI   r8   rJ   rK   rL   rM   rN   )rD   rS   rT   r    r    r$   test_hdbscan_callable_metric
  s    r   treekdZballc              	   C   s8   t dd|  dd}tt |t W 5 Q R X dS )z
    Tests that HDBSCAN correctly raises an error when passing precomputed data
    while requesting a tree-based algorithm.
    rC   Zprims_r   rD   ra   N)r   rO   rP   rQ   r:   r8   )r   rp   r    r    r$   "test_hdbscan_precomputed_non_brute  s    r   c            	   	   C   s   t  tj} tt| t }|dks*tt	t}|
 }t  |j}t| | tjdftjdffD ]\}}t
 }||d< t  |j} tt| t }|dkst| d t| d kst|
 }||d< t  |j}t| | qhd}tjt|d t d	d
d| W 5 Q R X dS )z
    Tests that HDBSCAN works correctly when passing sparse feature data.
    Evaluates correctness by comparing against the same data passed as a dense
    array.
    r   r(   r)   r   r   r   r   z4Sparse data matrices only support algorithm `brute`.rF   r   r   r   N)r   r:   r8   r;   rJ   rK   rL   rN   r   
csr_matrixr9   r   r0   r6   r7   r	   rO   rP   rQ   )	Zdense_labelsrT   Z	_X_sparseZX_sparseZsparse_labelsZoutlier_valr&   ZX_denserV   r    r    r$   test_hdbscan_sparse  s*    

r   ra   c                 C   s   ddg}t dd|dd\}}tdd|}t||j|jD ]*\}}}t||d	d
d t||d	d
d q<t| dtjd dt}|jjd dkst	|jjd dkst	dS )zj
    Tests that HDBSCAN centers are calculated and stored properly, and are
    accurate to the data.
    )rZ   rZ   )      @r   i  r   rw   )r   r   centerscluster_stdZboth)store_centersr3   g?)ZrtolZatol)ra   r   r   N)
r
   r   r:   zipZ
centroids_Zmedoids_r   r8   rk   rN   )ra   r   r{   r"   rp   centerZcentroidZmedoidr    r    r$   test_hdbscan_centersC  s      r   c                  C   s   t jd} | dd}tddddd|}t j|dd	\}}t|dksPt||d
k dksdttdddddd|}t j|dd	\}}t|dkst||d
k dkstdS )zS
    Tests that HDBSCAN single-cluster selection with epsilon works correctly.
    r      rd   r4   rZ   ZeomT)r   cluster_selection_epsiloncluster_selection_methodallow_single_cluster)Zreturn_countsr      g
ףp=
?r   )r   r   r   r   ra   N)	r0   randomZRandomStateZrandr   rI   uniquerJ   rN   )rngZno_structurerS   Zunique_labelscountsr    r    r$   .test_hdbscan_allow_single_cluster_with_epsilonY  s2    r   c                  C   sp   ddgddgddgddgg} t d| ddddgd	d
\}}t |}tt|jtd|jk }|dksltdS )z
    Validate that HDBSCAN can properly cluster this difficult synthetic
    dataset. Note that DBSCAN fails on this (see HDBSCAN plotting
    example)
    g333333g333333?r   i  皙?gffffff?g?r   )r   r   r   r   r      N)r
   r   r:   rJ   rK   r;   intrN   )r   r8   r"   rp   rT   r    r    r$   test_hdbscan_better_than_dbscanz  s    

r   z	kwargs, XrC   r3   rd   r   c                 C   s   t f ddi||  dS )zo
    Tests that HDBSCAN works correctly for array-likes and precomputed inputs
    with non-finite points.
    min_samplesr3   N)r   r:   )r8   kwargsr    r    r$   test_hdbscan_usable_inputs  s    r   c               	   C   sB   t td} d}tjt|d tdd|  W 5 Q R X dS )zd
    Tests that HDBSCAN raises the correct error when there are too few
    non-zero distances.
    )r   r   z#There exists points with fewer thanrF   rC   rH   N)	r   r   r0   ZzerosrO   rP   rQ   r   r:   )r8   rV   r    r    r$   -test_hdbscan_sparse_distances_too_few_nonzero  s    r   c               	   C   s   dd } d}t jt|d td| dt W 5 Q R X t jt|d td| dt W 5 Q R X tttj	tt
j	 }t|dkrt jt|d td|d dt W 5 Q R X d	S )
zR
    Tests that HDBSCAN correctly raises an error for invalid metric choices.
    c                 S   s   | S r*   r    )r,   r    r    r$   r.     r/   z2test_hdbscan_tree_invalid_metric.<locals>.<lambda>zV.* is not a valid metric for a .*-based algorithm\. Please select a different metric\.rF   r   )ra   rD   r   r   N)rO   rP   rQ   r   r:   r8   r<   rK   r   rn   r   rJ   )Zmetric_callablerV   Zmetrics_not_kdr    r    r$    test_hdbscan_tree_invalid_metric  s    r   c               	   C   s>   t ttd d} d}tjt|d | t W 5 Q R X dS )zx
    Tests that HDBSCAN correctly raises an error when setting `min_samples`
    larger than the number of samples.
    r3   )r   z min_samples (.*) must be at mostrF   N)r   rJ   r8   rO   rP   rQ   r:   )rp   rV   r    r    r$   !test_hdbscan_too_many_min_samples  s    r   c               	   C   sH   t  } tj| d< d}tdd}tjt|d ||  W 5 Q R X dS )zu
    Tests that HDBSCAN correctly raises an error when providing precomputed
    distances with `np.nan` values.
    r   z(np.nan values found in precomputed-denserC   rH   rF   N)	r8   r9   r0   r7   r   rO   rP   rQ   r:   )ZX_nanrV   rp   r    r    r$   "test_hdbscan_precomputed_dense_nan  s    

r   r   TFepsilonrv   c                    s   d}t || ddgddgddggd\}t |}t|j|jd}|d |d |d h}|d d|d d	|d di}t|||||d
fddttD   fddttD }	t	
|	j}
t|
 dS )zR
    Tests that the `_do_labelling` helper function correctly assigns labels.
    0   r   r   )r   r   r   rd   r   r   r3   condensed_treeclusterscluster_label_mapr   r   c                    s$   i | ]}|t  |kd  d  qS )r   )r0   wherer!   Z_y)r-   r    r$   
<dictcomp>  s      z+test_labelling_distinct.<locals>.<dictcomp>c                    s   i | ]}| |  qS r    r    r   )first_with_labelrS   r    r$   r     s      N)r
   r   r:   r   Z_single_linkage_tree_r   r   r<   rK   r0   Z	vectorizerm   r   )Zglobal_random_seedr   r   r   r8   Zestr   r   r   Zy_to_labelsZaligned_targetr    )r   rS   r-   r$   test_labelling_distinct  s6    
 r   c                  C   s   d} d}t jdd|dfddd|dfddgtd	}t|| h| d| d did
dd}|d dk }t|t|dkksttt|| h| d| d did
dd}|d |k }t|t|dkkstdS )z
    Tests that the `_do_labelling` helper function correctly thresholds the
    incoming lambda values given various `cluster_selection_epsilon` values.
    r4   g      ?rd   r3   )r4   r3   rv   r3   r   )r4   r   r   r3   )r4   r   rr   r3   )ZdtypeTr   valuer   N)r0   arrayr   r   sumrN   )r   Z
MAX_LAMBDAr   rS   Z	num_noiser    r    r$   test_labelling_thresholding  s:    



r   )G__doc__numpyr0   rO   Zscipyr   r   Zscipy.spatialr   Zsklearn.clusterr   Zsklearn.cluster._hdbscan._treer   r   r   Z sklearn.cluster._hdbscan.hdbscanr	   Zsklearn.datasetsr
   Zsklearn.metricsr   Zsklearn.metrics.pairwiser   r   Zsklearn.neighborsr   r   Zsklearn.preprocessingr   Zsklearn.utilsr   Zsklearn.utils._testingr   r   rM   r8   r-   rz   Z
ALGORITHMSitemsrL   markZparametrizerB   rW   r   Z
csc_matrixr^   r_   rq   rt   rx   r|   r~   r   r   r   r   r   r   r   r   r   r6   r   r   r   r   r   r   r   r    r    r    r$   <module>   s   

(




$
! 
$