U
    -e                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlZd dlZd dlmZmZ d dlmZ d dlmZ d d	lmZmZmZmZmZmZmZmZ d d
lmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z, dZ-dZ.e-e. Z/dd Z0dd Z1dd Z2dd Z3dd Z4dd Z5ej67deefdd Z8d d! Z9d"d# Z:d$d% Z;d&d' Z<d(d) Z=d*d+ Z>d,d- Z?d.d/ Z@d0d1 ZAd2d3 ZBd4d5 ZCd6d7 ZDd8d9 ZEd:d; ZFd<d= ZGd>d? ZHd@dA ZIdBdC ZJdDdE ZKdFdG ZLdHdI ZMe+dJdK ZNdLdM ZOej67deefdNdO ZPdPdQ ZQdRdS ZRdTdU ZSdVdW ZTe+dXdY ZUej67deefdZd[ ZVd\d] ZWd^d_ ZXd`da ZYe+dbdc ZZddde Z[dfdg Z\ej67dhej]ej^ej_gdidj Z`dkdl Zadmdn Zbdodp Zcdqdr Zddsdt Zedudv Zfdwdx Zgdydz Zhe+d{d| Zid}d~ Zjdd Zkdd Zlej67deeefdd Zmej67dejnejogdd Zpdd Zqej67dejrejodfejsejodfejnejndfejoejodfgdd Ztej67deddeddeddgdd Zudd Zve+dd Zwe,dd Zxe+ej67deeegdd Zyej67deeegej67ddezdfde{dfgdd Z|ej67deeej}ee+dgej67ddd dd gej67dddgdd Z~ej67deeegdd Zej67deeegej67dddgddddddddf	ddd dddddddf	ddd dddddddf	dddd dddd dddf	ddddddd dddf	dgddƄ Zej67dedddʜddd̜gfee-ffdd΄ ZddЄ Ze+dd҄ Zej67deeeegddԄ ZdS )    N)defaultdict)Mapping)partial)StringIO)assert_array_almost_equalassert_array_equal)sparse)clone)ENGLISH_STOP_WORDSCountVectorizerHashingVectorizerTfidfTransformerTfidfVectorizerstrip_accents_asciistrip_accents_unicode
strip_tags)GridSearchCVcross_val_scoretrain_test_split)Pipeline)	LinearSVC)IS_PYPY)assert_allclose_dense_sparseassert_almost_equalfails_if_pypyskip_if_32bit)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 C   s   t |  S N)r   uppers r    k/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercase8   s    r"   c                 C   s   |  ddS )N   ée)replacer   r    r    r!   strip_eacute<   s    r&   c                 C   s   |   S r   splitr   r    r    r!   split_tokenize@   s    r)   c                 C   s   dgS )NZthe_ultimate_featurer    r   r    r    r!   lazy_analyzeD   s    r*   c                  C   s   d} d}t | |kstd} d}t | |ks0td} d}t | |ksHtd} d}t | |ks`td	} d
}t | |ksxtd} d}t | |kstd} d
}t | |kstd S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫ)r   AssertionErroraexpectedr    r    r!   test_strip_accentsH   s*    r8   c                  C   sd   d} d}t | |kstd} d}t | |ks0td} d}t | |ksHtd} d}t | |ks`td S )	Nr+   r,   r-   r.   r/   r3   r0   r1   )r   r4   r5   r    r    r!   test_to_asciil   s    r9   
Vectorizerc              
   C   s  | dd  }d}dddddd	d
dddg
}|||ks:td}dddddddg}|||ks`t| dd  }td}dddddddg}|||kst| td  }d}ddd d!d"d#d$d%d&d'g
}|||kst| tdd(  }d}d)ddddd*d+ddd,g
}|||kstd S )-Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.thisistestZreallyZmetZharryZ	yesterdayfile)input'This is a test with a file-like object!withlikeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.ZAIZMANGEZDUZ	KANGOUROUZCEZMIDIZETAITZPASZTRESZBON)	tokenizerr=   zj'aizmidi,zc'etaitzbon.)build_analyzerr4   r   r"   r)   )r:   watextr7   r    r    r!   test_word_analyzer_unigrams   sb    rX   c                  C   sT   t dddd } d}dddd	d
ddddddddddddddg}| ||ksPtd S )Nwordunicode      analyzerr=   ngram_ranger>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rU   r4   )rV   rW   r7   r    r    r!   'test_word_analyzer_unigrams_and_bigrams   s6      
ra   c               	   C   sp   d} |  d}tddd }tt || W 5 Q R X tdddd }tt || W 5 Q R X d S )	Nr>   zutf-8r[   r;   )r`   encodingchar      )r_   r`   rb   )encoder   rU   pytestraisesUnicodeDecodeError)rW   Z
text_bytesrV   car    r    r!   test_unicode_decode_error   s    
  
rl   c                  C   s   t dddd } d}dddd	d
g}| |d d |ks<tdddddg}| |dd  |ksbtd}dddddg}| |d d |kstdddddg}| |dd  |kstt dddd } td}dddddg}| |d d |kstd S ) Nrc   rZ   rd   r^   u9   J'ai mangé du kangourou  ce midi, c'était pas très bonzj'az'aizai zi mz ma   zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterdaythihisis zs iz isz yesteyesteresterdsterdaterdayrL   rM   r_   r`   rN   r   rU   r4   r   cngarW   r7   r    r    r!   test_char_ngram_analyzer   s.      
  
r{   c                  C   s   t dddd } d}dddd	d
g}| |d d |ks<tdddddg}| |dd  |ksbtt dddd } td}ddddddg}| |d d |kstd S )NZchar_wbrZ   rd   r^   ro   z thrp   rq   rr   z thirm   rs   rt   ru   rv   zerday rn   rL   rw   zA test with a file-like object!z a z teZtesestzst z tesrf   rx   ry   r    r    r!   test_char_wb_ngram_analyzer
  s$      
  
r}   c                  C   s   t dddd } d}dddg}| |d d	 |ks8td
ddg}| |dd  |ksZtt dddd }t|}||| |kstd S )NrY   rZ   rd   r^   ro   zthis is testzis test reallyztest really metre   ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayrL   rw   rx   )rz   rW   r7   Z	cnga_filerL   r    r    r!   test_word_ngram_analyzer  s(      

  
r   c                  C   s   ddd} t |  }ttttttfD ]}|| }t|d}|	t
 t|trb|j| ksttnt |j|kstt|t
}|jd t|kst|| }t|d}||}t||jd ks(tq(d S )Nr   r\   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_r4   	transformshapeleninverse_transform)vocabtermstypvvectXinvr    r    r!   &test_countvectorizer_custom_vocabulary5  s    






r   c                  C   sd   ddg} t dt| dfdt fg}|t}t|jd jt| ksJt|j	d t
| ks`td S )Nr   r   countr   tfidfr\   )r   r   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r4   r   r   )Zwhat_we_likepiper   r    r    r!   /test_countvectorizer_custom_vocabulary_pipelineJ  s    
r   c               	   C   sB   ddd} d}t jt|d t| d}|dg W 5 Q R X d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   Zpasta_sizilianarh   ri   
ValueErrorr   r   )r   msgr   r    r    r!   7test_countvectorizer_custom_vocabulary_repeated_indicesW  s
    

r   c               	   C   s>   ddd} t jtdd t| d}|dg W 5 Q R X d S )Nr\   r]   r   zdoesn't contain indexr   r   Zpasta_verdurar   r   r   r    r    r!   0test_countvectorizer_custom_vocabulary_gap_index_  s    

r   c               	   C   s   t  } | jdd |  tks"t| jdd tt |   W 5 Q R X | jdd tt |   W 5 Q R X dddg}| j|d |  t|kstd S )Nenglish
stop_wordsZ_bad_str_stop_Z_bad_unicode_stop_Zsomeotherwords)	r   
set_paramsget_stop_wordsr
   r4   rh   ri   r   r   )cvZstoplistr    r    r!   test_countvectorizer_stop_wordsf  s    
r   c               	   C   sj   t jtdd tg d} | dg W 5 Q R X t jtdd" tddd}|dd	d
g W 5 Q R X d S )Nzempty vocabularyr   r   foo      ?r   )max_dfr   zto be or not to bez
and me toozand so do your   )r   r   r    r    r!   %test_countvectorizer_empty_vocabularyu  s    
r   c                  C   sF   t  } | td d }| tdd  }|jd |jd ksBtd S )Nrm   r\   )r   r   r   r   r4   )r   ZX1X2r    r    r!   test_fit_countvectorizer_twice  s    r   c                  C   sD   ddddg} d}t |d}||  ddd	g}| }t|| d
S )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    &This is the 1st document in my corpus. This document is the 2nd sample.And this is the 3rd one.Is this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_patterndocumentonesampleN)r   r   get_feature_names_outr   )corpusr   
vectorizerr7   feature_names_outr    r    r!   )test_countvectorizer_custom_token_pattern  s    


r   c               	   C   sF   ddddg} d}d}t |d}tjt|d ||  W 5 Q R X d	S )
zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   r   r   r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   rh   ri   r   r   )r   r   err_msgr   r    r    r!   <test_countvectorizer_custom_token_pattern_with_several_group  s    
r   c               	   C   sn   ddddg} d}t d| d}tjt|d ||  W 5 Q R X t  td	t ||  W 5 Q R X d S )
NZSampleZUpperZCaseZ
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   rh   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   r    r    r!   'test_countvectorizer_uppercase_in_vocab  s    
r   c                  C   sP   dddgdddgdddgg} t ddd| }dddg}||}t|| d	S )
z0Check get_feature_names_out for TfidfTransformerr\   r   Tl2
smooth_idfnormr6   cbN)r   r   r   r   )r   trZfeature_names_inr   r    r    r!   %test_tf_transformer_feature_names_out  s
    

r   c                  C   s   dddgdddgdddgg} t ddd}||  }|dk sFtt|d jdddddg dddgdddgdddgg} t ddd}||  }|dk std S )	Nr\   r   Tr   r   r]   Zaxisr   )r   r   toarrayallr4   r   sumr   r   r   r    r    r!   test_tf_idf_smoothing  s    r   c               	   C   s   dddgdddgdddgg} t ddd}||  }|dk sFtt|d jdddddg dddgdddgdddgg} t ddd}d	}tjt	|d
 ||   W 5 Q R X d S )Nr\   r   Fr   r   r]   r   r   zdivide by zeror   )
r   r   r   r   r4   r   r   rh   r   RuntimeWarning)r   r   r   Zin_warning_messager    r    r!   test_tfidf_no_smoothing  s    r   c                  C   s   dgdgdgg} t ddd d}||  }|d dks<t|d |d ksPt|d |d ksdt|d dk stt|d dk std S )Nr\   r]   re   TF)sublinear_tfuse_idfr   r   )r   r   r   r4   r   r    r    r!   test_sublinear_tf  s    r   c               	   C   sp  t td d } td g}ttd }tdd}|| }t|drL| }|d|jd f dksftt|jd	}||fD ]}|	|}t|dr| }|j}|d|d
 f dkst|d|d f dkst|d|d f dkstd|kstd|kst|d|d f dkst|d|d f dks6t|d|d f dksPt|d|d f dksztqzt
dd}	|	|	| }
t|	jt|jkst|
j|t|jfkst|		| }|jt|t|jfkstt
ddd}||	| }t|drtt
dd}tt |	| W 5 Q R X ttj|dddg|  t td d } tdd}|j|_||  }|jrtt|
| |	| }t|| td d	}tt |	|  W 5 Q R X |jddd | }d}t|}||}||kst|jdd d tt |  W 5 Q R X d |_tt |  W 5 Q R X d S )!Nr\         ?r   tocsrr   r   r]   r   saladtomatowaterthe	copyrightcokeburgerr   l1r   F)r   r   idf_Tr   r   r   r;   )r=   r   r>   Z_gabbledegook_)r=   rS   Z_invalid_analyzer_type_)r   r   r   r   r   hasattrr   r   r4   r   r   r   r   r   r   rh   ri   r   r   npr   r   r   fixed_vocabulary_r   build_preprocessorr   rU   )
train_data	test_dataZn_trainZv1Zcounts_trainZv2r   Zcounts_testr   t1r   Z
tfidf_testt2tft3tvZtfidf2Ztfidf_test2Zv3	processorrW   r7   resultr    r    r!   test_vectorizer  sv    











r  c                  C   s  d\} }}}t | |||d}|t |jj| ks6t|jj|ksFt|jj|ksVt|jj|ksftd|_d|_d|_d|_|jj| kst|jj|kst|jj|kst|jj|kst|t |jj|jkst|jj|jkst|jj|jkst|jj|jkstd S )N)r   FFF)r   r   r   r   r   T)	r   r   r   _tfidfr   r4   r   r   r   )r   r   r   r   r   r    r    r!   test_tfidf_vectorizer_settersa  s2       

r  c                  C   s  t  } | t}|j}|jtt| jfks.t|j| jks>tt	
|jdksRtt	
|jdk sftt	|jdksztt	|jdk stt|jd D ]}tt	j|d jdd qt ddd} | t}|jtt| jfkst|j| jkst|j}||kst|d| k s tt	
|jdks6tt	|jdk sLtt|jd D ] }tt	j|d jdd qZd S )	Nr   r   r\   r]   r   r[   r   )r`   r   )r   r   r   nnzr   r   
n_featuresr4   dtyper   mindatamaxranger   Zlinalgr   )r   r   Z	token_nnziZ
ngrams_nnzr    r    r!   test_hashing_vectorizer~  s,    

r  c               
   C   sN  t dd} tt |   W 5 Q R X | jr2t| t}|j	\}}t
| j|ksXt|  }t|tjspt|jtks~tt
||ksttddddddd	d
dg	| t|D ]\}}|| j|kstqddddddd	d
dg	}t |d} |  }tddddddd	d
dg	| | js tt|D ] \}}|| j|ks(tq(d S )Nr   r   r   r   celerir   r   r   	sparklingr   r   r   )r   rh   ri   r   r   r   r4   r   r   r   r   r   r   r   Zndarrayr  rQ   r   	enumerateget)r   r   Z	n_samplesr  Zfeature_namesidxnamer   r    r    r!   test_feature_names  sl    




r  c                 C   sX   ddddh}ddddd	d
dh}| ddd}| t t|j|ksFt|j|ksTtd S )Nr   r   r   r   r  r   r   r   r  r   r   g333333?   )r   max_features)r   r   r   r   r4   stop_words_)r:   Zexpected_vocabularyZexpected_stop_wordsr   r    r    r!   test_vectorizer_max_features  s    
r  c            	      C   s   t dd} t dd}t d d}| tjdd}|tjdd}|tjdd}|  }| }| }d| ks|td| kstd| kstd|t| kstd|t| kstd|t| kstd S )Nr\   r  re   r   r      r   )	r   r   r   r   r   r
  r4   r   Zargmax)	Zcv_1Zcv_3Zcv_NoneZcounts_1Zcounts_3Zcounts_NoneZ
features_1Z
features_3Zfeatures_Noner    r    r!   "test_count_vectorizer_max_features   s    


r  c                  C   s  dddg} t ddd}||  d|j ks2tt|j dksHtt|jd	ksZtd
|_||  d|j ks|tt|j dkstd|jkstt|jdkstd|_||  d|j kstt|j dkstd|jkstt|jdkstd S )Nabcdeaeatrc   r   r_   r   r6   rf   r   r   r  r]   r\   )r   r   r   r   r4   r   r  r   r   r   r    r    r!   test_vectorizer_max_df  s$    



r!  c                  C   s  dddg} t ddd}||  d|j ks2tt|j dksHtt|jd	ksZtd
|_||  d|j ks|tt|j d
kstd|jkstt|jdkstd|_||  d|j kstt|j dkstd|jkstt|jdkstd S )Nr  r  r  rc   r\   )r_   min_dfr6   rf   r   r]   r   r  g?rm   )r   r   r   r   r4   r   r  r"  r   r    r    r!   test_vectorizer_min_df1  s$    



r#  c                  C   s   ddg} t ddd}||  }tdddd	d
g|  tdddddgdddddgg| t dddd}||  }tdddddgdddddgg| t dddtjd}|| }|jtjkstd S )Naaabcabbderc   r   r  r6   r   r   dr$   re   r\   r   r]   T)r_   r   binary)r_   r   r'  r  )	r   r   r   r   r   r   float32r  r4   )r   r   r   ZX_sparser    r    r!   test_count_binary_occurrencesH  s    ""
r)  c                  C   s   ddg} t ddd d}|| }t|dd jdks<tt|dd	 jd	ksXt|jtjkshtt ddd
d d}|| }t|jdkst|jtjkstt ddd
d tjd}|| }|jtjkstd S )Nr$  r%  Frc   )alternate_signr_   r   r   r\   re   r]   T)r_   r*  r'  r   )r_   r*  r'  r   r  )r   r   r   r
  r	  r4   r  float64)r   r   r   r    r    r!   test_hashed_binary_occurrences\  s0    
   
    
r,  c                 C   s  t }|  }||}||}t|ts,t| }t||D ]6\}}t	t
||}t	t
|}t|| q>t|st|jdkst| }	||	}
t||
D ]\}}tt	|t	| q| }||}t||D ]\}}tt	|t	| qd S )NZcsr)r   r   r   r   r   r4   rU   zipr   sortuniquer   r   issparseformatr   Ztocsc)r:   r	  r   Ztransformed_dataZinversed_dataZanalyzedocZinversed_termsr   Ztransformed_data2Zinversed_data2Zterms2Ztransformed_data3Zinversed_data3Zterms3r    r    r!   !test_vectorizer_inverse_transformw  s(    



r3  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdtdd	fg}d
dgdd}t||ddd}|||	|}	t
|	| |jdkst|jjd }
|
jd
kstd S )Nr   r\   g?r   Z	test_sizerandom_stater   svcautoZdualr\   r\   r[   ZhingeZsquared_hinge)vect__ngram_range	svc__lossre   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr   best_score_r4   best_estimator_r   r`   r	  targetr   r   Ztarget_trainZtarget_testpipeline
parametersZgrid_searchpredZbest_vectorizerr    r    r!   -test_count_vectorizer_pipeline_grid_selection  s$       
rG  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdtdd	fg}d
dgddd}t||dd}|||	|}	t
|	| |jdkst|jjd }
|
jd
kst|
jdkst|
jrtd S )Nr   r\   g?r   r4  r   r6  r7  r8  r9  r[   )r   r   r:  )r;  Z
vect__normr<  )r=  r   r   )r   r>  r   r   r   r   r   r   r   r?  r   r@  r4   rA  r   r`   r   r   rB  r    r    r!   'test_vectorizer_pipeline_grid_selection  s*       
rH  c                  C   sd   t t } dgtt  dgtt  }tdt fdtddfg}t|| |dd}t|d	d	d	g d S )
Nr   r\   r   r6  r7  r8  re   )r   r   )r   r>  r   r   r   r   r   r   )r	  rC  rD  Z	cv_scoresr    r    r!   )test_vectorizer_pipeline_cross_validation  s
    rI  c                  C   sx   d} t  }|| g}|jdks$ttd dd}|| g}|jdksJt|j|jksZttt	|j
t	|j
 d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)r\      F)r   r*  )r\   i   )r   r   r   r4   r   r   r  r   r   r.  r	  )r   r   Z	X_countedZX_hashedr    r    r!   test_vectorizer_unicode  s    rK  c                  C   sF   ddg} t | d}|t}|t}t| |  |jsBtd S )Nr   r  r   )r   r   r   r   r   r   r   r4   )r   r   ZX_1ZX_2r    r    r!   +test_tfidf_vectorizer_with_fixed_vocabulary  s    


rL  c                  C   s   t  t ddt ddt ddt ttdttdttdtttd	tt ttdt tg} | D ]f}t	|}t
|}t||jkst| | ksttrt|t rqjqjt|t|t qjd S )
Nr   r   T)r'  r[   r`   rR   )r_   r<   )r   r   r   r*   r   r   r&   r   pickledumpsloadstype	__class__r4   
get_paramsr   r   r   r   )Z	instancesorigr   copyr    r    r!   test_pickling_vectorizer  s0    


rV  factoryc                 C   sB   t  }| |}d}tt|}||}||}||ks>tdS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    r>   N)r   rN  rP  rO  r4   )rW  vecfunctionrW   Zroundtripped_functionr7   r  r    r    r!   test_pickling_built_processors0  s    rZ  c                  C   s   t jd} t dddddddd	d
g	}tddD ]X}t| j|ddd}t|d}t	t
|}|t |t t| |  q2d S Nr   r   r   r  r   r   r   r  r   r   d   rm   F)sizer%   r   )r   randomRandomStatearrayr  r   choicer   rN  rP  rO  r   r   r   r   )rngvocab_wordsxZ	vocab_setr   unpickled_cvr    r    r!   -test_countvectorizer_vocab_sets_when_picklingE  s.    


 rf  c                  C   s   t jd} t dddddddd	d
g	}tddD ]v}t }| j|ddd}tddD ]}|||| < qVt|d}t	t
|}|t |t t| |  q2d S r[  )r   r^  r_  r`  r  r   ra  r   rN  rP  rO  r   r   r   r   )rb  rc  rd  Z
vocab_dictr   yr   re  r    r    r!   .test_countvectorizer_vocab_dicts_when_picklinga  s4    


 rh  c                  C   s   t  tttdtttdtf} | D ]R}|t }d |_|t }t	|d |t }t
|| t
|| q.d S )NrR   r<   r  )r   r   r   r   r   r&   r   r   r  delattrr   )Zfitted_vectorizersr   Zvect_transformZstop_None_transformZstop_del_transformr    r    r!   test_stop_words_removal~  s    


rj  c                  C   s`   t  t} t | }t|}t|}t||j	ks>t
t||  ||   d S r   )r   r   r   r   r   rN  rO  rP  rQ  rR  r4   r   r   )r   rT  r   rU  r    r    r!   test_pickling_transformer  s    

rk  c                  C   sH   t  t} t | }t }|j|_t||  ||   d S r   )	r   r   r   r   r   r   r   r   r   )r   rT  rU  r    r    r!   test_transformer_idf_setter  s
    rl  c               	   C   s   t dd} | t t | jdd}| j|_t|t | t  t | jdd}d}tj	t
|d | j|_W 5 Q R X d S )NTr   r   r   Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r   r   r   r   rh   ri   r   )rT  rU  r   r    r    r!   test_tfidf_vectorizer_setter  s    

rn  c               	   C   s`   t dd} | t t | jdd}t| j}dg|d  }tt t	|d| W 5 Q R X d S )NTr   rm  r   r\   r   )
r   r   r   r   r   r   rh   ri   r   setattr)r   rU  Zexpected_idf_lenZinvalid_idfr    r    r!   %test_tfidfvectorizer_invalid_idf_attr  s    


rp  c               	   C   s<   dddddg} t | d}tt |g  W 5 Q R X d S )Nr6   r   r   r   r   r   r    r    r!   test_non_unique_vocab  s    
rq  c               	   C   s4   d} t }dd }tj|| d |  W 5 Q R X d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  S   s   t  } | dtjdg d S )Nhello worldhello hello)r   r   r   nan)Zhvr    r    r!   func  s    z0test_hashingvectorizer_nan_in_docs.<locals>.funcr   )r   rh   ri   )r   	exceptionru  r    r    r!   "test_hashingvectorizer_nan_in_docs  s
    rw  c                  C   sl   t ddd d} | jst| ddg }t| ddddg | ddg }t| ddddg d S )NTF)r'  r   r   rr  rs  r\   r   )r   r'  r4   r   r   r   Zravelr   )r   r   r   r    r    r!   test_tfidfvectorizer_binary  s    
rx  c                  C   s(   t dd} | t t| j| jj d S )NTr   )r   r   r   r   r   r  )r   r    r    r!   test_tfidfvectorizer_export_idf  s    

ry  c                  C   s<   t dgd} t| }| t |t |j| jks8td S )Nr   r   )r   r	   r   r   r   r4   )Z
vect_vocabZvect_vocab_cloner    r    r!   test_vectorizer_vocab_clone  s
    

rz  c              	   C   s   d}|  }t jt|d |d W 5 Q R X t jt|d |d W 5 Q R X |ddg t jt|d |d W 5 Q R X d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)rh   ri   r   r   r   r   )r:   r   rX  r    r    r!   &test_vectorizer_string_object_as_input  s    r|  X_dtypec                 C   s2   t jdd| dd}t |}|j|jks.td S N
   i N  *   )r  r5  )r   randr   r   r  r4   )r}  r   ZX_transr    r    r!   test_tfidf_transformer_type   s    r  c                  C   s^   t jddtjdd} t | }t | }t |}t |}t|| |j	|j	ksZt
d S r~  )r   r  r   r+  Z
csc_matrix
csr_matrixr   r   r   r1  r4   )r   ZX_cscZX_csrZX_trans_cscZX_trans_csrr    r    r!   test_tfidf_transformer_sparse  s    


r  z0vectorizer_dtype, output_dtype, warning_expectedTFc              	   C   s   t dddg}t| d}d}|rHtjt|d ||}W 5 Q R X n*t  t	dt ||}W 5 Q R X |j
|kstd S )NnumpyscipyZsklearnr  z'dtype' should be used.r   r   )r   r`  r   rh   r   r   r   r   r   r   r  r4   )Zvectorizer_dtypeZoutput_dtypeZwarning_expectedr   r   Zwarning_msg_matchZX_idfr    r    r!   test_tfidf_vectorizer_type  s    


r  rX  )r]   r\   rM  c              	   C   s   | j }td| d}t| tr2tr2tjdd tjt	|d | 
dg W 5 Q R X tjt	|d | dg W 5 Q R X t| trtjt	|d | dg W 5 Q R X d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.*HashingVectorizer is not supported on PyPy)reasonr   zgood news everyone)r`   reescaper   r   r   rh   xfailri   r   r   r   r   )rX  Zinvalid_ranger   r    r    r!   $test_vectorizers_invalid_ngram_range*  s    

r  c                 C   s&   |   }|  }|  }| |||S r   )r   build_tokenizerr   _check_stop_words_consistency)Z	estimatorr   tokenize
preprocessr    r    r!   r  H  s    r  c               
   C   s   d} d|  }t  t t fD ]R}|jddddgd tjt|d |d	g W 5 Q R X |`t	|d
kst
qt  tdt |d	g W 5 Q R X t	|d kst
|jdddddgd tjt|d |d	g W 5 Q R X d S )Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.you'veyouyou'llANDr   r   rr  Fr   Zblah)r   r   r   r   rh   r   r   r   Z_stop_words_idr  r4   r   r   r   )Zlstrr   rX  r    r    r!   'test_vectorizer_stop_words_inconsistentO  s$    
r  c                  C   s`   t jdtjd} tj}| j|| _| j|| _dddd}t | |}||jj	ks\t
dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )rm   rm   r  r   r\   r]   )zscikit-learnrJ   zgreat!N)r   r  r   int64indicesZastypeZindptrr   Z_sort_featuresr  r4   )r   ZINDICES_DTYPEr   ZXsr    r    r!   7test_countvectorizer_sort_features_64bit_sparse_indicesk  s    r  	Estimatorc                 C   s   ddig}|  }t |dks t| dd dgd}t |dksBtt |d ksRt|| G d	d
 d
| }|dgd}t |dkst| dd dgd}t |dkstd S )NrW   r{  Tc                 S   s   | d S NrW   r    rd  r    r    r!   <lambda>      z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>and)rS   r   r   c                   @   s   e Zd Zdd ZdS )zFtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                 S   s   dd S )Nc                 S   s   | d S r  r    r  r    r    r!   r    r  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>r    )selfr    r    r!   r     s    zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorN)__name__
__module____qualname__r   r    r    r    r!   CustomEstimator  s   r  r   c                 S   s   t d| S )Nz\w{1,})r  compilefindallr2  r    r    r!   r    r  )rT   r   )r  r4   r   )r  r	  rX  r  r    r    r!   -test_stop_word_validation_custom_preprocessor  s    

 r  zinput_type, err_type, err_msgfilenamer3   rL   z$'str' object has no attribute 'read'c              	   C   sR   t | trtrtd dg}tj||d | dd |d| W 5 Q R X d S )Nr  "this is text, not file or filenamer   c                 S   s   |   S r   r'   r  r    r    r!   r    r  z.test_callable_analyzer_error.<locals>.<lambda>r_   rM   )
issubclassr   r   rh   r  ri   r   )r  
input_typeZerr_typer   r	  r    r    r!   test_callable_analyzer_error  s
    
r  )Zmarksr_   c                 C   s
   t | dS )Nr)openr  r    r    r!   r    r  r  c                 C   s   |   S r   )readr  r    r    r!   r    r  r  c              	   C   s6   dg}t ttf | ||d| W 5 Q R X d S )Nr  r  )rh   ri   FileNotFoundErrorAttributeErrorr   )r  r_   r  r	  r    r    r!   &test_callable_analyzer_change_behavior  s    r  c              	   C   sf   dd }t |tr tr td | d}|d tjtdd ||dd		|g W 5 Q R X d S )
Nc                 S   s   t dd S )Ntesting)	Exceptionr  r    r    r!   r_     s    z6test_callable_analyzer_reraise_error.<locals>.analyzerr  zfile.txtzsample content
r  r   rL   r  )
r  r   r   rh   r  joinwriteri   r  r   )Ztmpdirr  r_   fr    r    r!   $test_callable_analyzer_reraise_error  s    


r  zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgr  r  r9  rc   z'stop_words'
'analyzer'	!= 'word'c                 C   s   |   S r   r'   r   r    r    r!   r    r  z'tokenizer'c                 C   s   |   S r   r'   r   r    r    r!   r    r  \w+rY   'token_pattern'zis not Nonec                 C   s   |   S r   r   r   r    r    r!   r    r  c                 C   s   |   S r   r  r   r    r    r!   r    r  z'preprocessor'zis callabler[   c                 C   s   |   S r   r  r   r    r    r!   r    r  z'ngram_range')	NNNr9  r  rc   r  r  r  c
              	   C   sV   t }
|  }|j||||||d d|||	f }tjt|d ||
 W 5 Q R X d S )N)r   rT   rS   r`   r   r_   z-The parameter %s will not be used since %s %sr   )r   r   rh   r   r   r   )r:   r   rT   rS   r`   r   r_   Zunused_nameZ	ovrd_nameZovrd_msgr   r   r   r    r    r!   test_unused_parameters_warn  s"    Yr  zVectorizer, Xr\   r]   )r   barre   )r   Zbazc                 C   s0   |  }t |drt|| t |dr,td S )NZn_features_in_)r   r4   r   )r:   r   r   r    r    r!   test_n_features_inD  s    	
r  c                  C   s:   t dd} | ddgj}| ddgj}||ks6td S )Nr\   r  ZhelloZworld)r   r   r   r4   )rX  Zvocab1Zvocab2r    r    r!   )test_tie_breaking_sample_order_invarianceS  s    
r  c                  C   s.   t ddd} | dgj}|d dks*td S )Ni@B )r]   re   )r  r`   z22pcs efuturer   )r   r   r  r4   )Zhashingr  r    r    r!   2test_nonnegative_hashing_vectorizer_result_indices\  s    r  c                 C   s   |  }t |drtdS )z0Check that vectorizers do not define set_output.Z
set_outputN)r   r4   )r  r|   r    r    r!   'test_vectorizers_do_not_have_set_outputd  s    r  )rN  r  r   collectionsr   collections.abcr   	functoolsr   ior   r  r   rh   Znumpy.testingr   r   r  r   Zsklearn.baser	   Zsklearn.feature_extraction.textr
   r   r   r   r   r   r   r   Zsklearn.model_selectionr   r   r   Zsklearn.pipeliner   Zsklearn.svmr   Zsklearn.utilsr   Zsklearn.utils._testingr   r   r   r   r   r>  r   r"   r&   r)   r*   r8   r9   markZparametrizerX   ra   rl   r{   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r!  r#  r)  r,  r3  rG  rH  rI  rK  rL  rV  rU   r   r  rZ  rf  rh  rj  rk  rl  rn  rp  rq  rw  rx  ry  rz  r|  r(  r+  r  r  Zint32r  r  r  r  r  r  r  r  r  r  paramr  r  r  r  r  r  r  r    r    r    r!   <module>   s  (
	$
=
g
&G


$'


	

 

	



    
 J 
	
 
