U
    ‰dQÛ  ã                   @   sÂ  d dl mZ d dlmZmZ d dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZm Z  d dl!m"Z" e #d	¡ G d
d„ dƒZ$e
 %ej& 'd¡d¡G dd„ de"ƒƒZ(G dd„ de"ƒZ)e
 %e j* d¡e
 %e  +¡ dk d¡G dd„ de"ƒƒƒZ,e
 %e j* d¡e
 %e  -¡ dk d¡G dd„ de"ƒƒƒZ.e
 %e j* d¡e
 %e  -¡ dk d¡G dd„ de"ƒƒƒZ/e0dkr¾d dl
Z
e
 1¡  dS )é    )Úviewkeys)ÚProcessÚQueueN)ÚMock)ÚassumeÚgivenÚsettings)Ú
caffe2_pb2)	ÚbrewÚcoreÚcnnÚdata_parallel_modelÚdyndepÚmodel_helperÚ	optimizerÚrnn_cellÚ	workspace)ÚTestCasez2@/caffe2/caffe2/distributed:file_store_handler_opsc                   @   s   e Zd Zdd„ Zdd„ ZdS )ÚTemporaryDirectoryc                 C   s   t  ¡ | _| jS ©N)ÚtempfileÚmkdtempÚtmpdir©Úself© r   úJ/tmp/pip-unpacked-wheel-ua33x9lu/caffe2/python/data_parallel_model_test.pyÚ	__enter__   s    
zTemporaryDirectory.__enter__c                 C   s   t  | j¡ d S r   )ÚshutilÚrmtreer   )r   ÚtypeÚvalueÚ	tracebackr   r   r   Ú__exit__   s    zTemporaryDirectory.__exit__N)Ú__name__Ú
__module__Ú__qualname__r   r#   r   r   r   r   r      s   r   ZTRAVISz&DPMTest has a known issue with Travis.c                   @   sz  e Zd Zdd„ Zd+dd„Zdd„ Zdd	„ Zd
d„ Ze 	d¡dd„ ƒZ
e 	d¡dd„ ƒZdd„ Zdd„ Zee dd¡e dd¡dedddd„ ƒƒZe ej d¡e e ¡ dk d ¡ee dd¡e dd¡deddd!d"„ ƒƒƒƒZd#d$„ Zee dd¡e dd¡deddd%d&„ ƒƒZe ej d¡e e ¡ dk d ¡ee dd¡e dd¡deddd'd(„ ƒƒƒƒZd)d*„ ZdS ),ÚDataParallelModelTestc                 C   s(  dd„ }dd„ }dd„ }t  ¡  tjdd |¡d	}tj|||||| | | d
 t |dg¡ t |¡}|  	t
|ƒd¡ tj d¡ d}tddƒD ]|}	tj |d¡}
t |
dd…df ¡}|t
|ƒ }t|ƒD ]”\}}|| }|| }|
||…dd…f  tj¡}|||…  tj¡}t t |j|¡¡2 t  d |j|¡|¡ t  d |j|¡|¡ W 5 Q R X qÔ|	dkrŒt  |j¡ t  |j¡ t j|jd t |	d g¡ tj¡t |jd¡d t  |j  ¡ j!¡ |j"D ]:}t  #|jd |¡ ¡d }|  $t%||	d  ƒdk ¡ qØq–t  #d |j¡¡S )ú0
        Helper function for test_equiv
        c                 S   s   d S r   r   ©Úmodelr   r   r   Úinput_builder_fun.   s    z:DataParallelModelTest.run_model.<locals>.input_builder_func              	   S   sz   |   dddddi fdi f¡}|  |d¡}|  |d¡}|  |dgd	¡}|  |d
¡}| j||d}| jjg dgdgd |gS )NÚdataÚfcé   é   ÚConstantFillÚfc_flÚsigmÚlabelÚsqÚloss©ÚscaleÚsync_num©Úshape)ÚFCÚFlattenToVecÚSigmoidÚSquaredL2DistanceÚAveragedLossÚScaleÚparam_init_netÚUniformFill)r*   Ú
loss_scaler-   r1   r2   r4   r5   r   r   r   Úmodel_build_fun1   s     ÿz8DataParallelModelTest.run_model.<locals>.model_build_func                 S   s   t j| dddddS )Nçš™™™™™¹?Úfixedg      @T)ÚpolicyZmax_gradient_normZallow_lr_injection©r   Z	build_sgdr)   r   r   r   Úadd_optimizer>   s    ûz6DataParallelModelTest.run_model.<locals>.add_optimizerÚNHWCztest{}©ÚorderÚname©r+   Úforward_pass_builder_funÚoptimizer_builder_funÚdevicesÚ
cpu_deviceZshared_modelÚcombine_spatial_bnr8   r   é+
  é@   é
   r.   Nú
{}_{}/dataú{}_{}/labelz_0/sync_numé   ©Údevice_optionz_{}/sync_numg{®Gáz„?ú	{}_0/fc_w)&r   ÚResetWorkspacer   ÚCNNModelHelperÚformatr   ÚParallelizeZAddBlobSyncZGetLearningRateBlobNamesZassertGreaterÚlenÚnpÚrandomÚseedÚrangeÚrandÚroundÚ	enumerateÚastypeÚfloat32r   ÚDeviceScopeÚDeviceOptionÚ_device_typeÚFeedBlobÚ_device_prefixÚ
RunNetOncerA   Ú	CreateNetÚnetÚarrayÚRunNetÚProtorM   Ú_devicesÚ	FetchBlobÚ
assertTrueÚabs)r   rQ   Úgpur+   rD   rI   r*   Zlr_namesÚ
batch_sizeÚiÚ	full_dataÚfull_labelsÚbatch_per_deviceÚjÚgÚstÚenr,   ÚlabelsÚsyncr   r   r   Ú	run_model*   st    	þø

 ÿ ÿ
ý
ÿÿ zDataParallelModelTest.run_modelNc                    s’   t ƒ ‰‡ ‡‡fdd„}g }t|d ƒD ]*}||d< t||d}| ¡  | |¡ q&t|ƒdkrŽ| d¡}| ¡ rR| d¡ ˆ 	¡ shˆ 
¡ ‚qhqRd S )Nc               
      sz   zHˆ d krˆ| |Ž t  ¡  n(t ˆ ¡ ˆ| |Ž t  ¡  W 5 Q R X W n, tk
rt } zˆ |¡ W 5 d }~X Y nX d S r   )r   r]   r   rk   Ú	ExceptionÚput)ÚargsÚkwargsÚex©r[   ÚfnÚqueuer   r   Úrun_fn‰   s    


z6DataParallelModelTest.run_test_locally.<locals>.run_fnÚ	comm_sizeÚ	comm_rank)ÚtargetrŠ   r   r/   )r   re   r   ÚstartÚappendra   ÚpopÚis_aliveÚjoinÚemptyÚget)r   r   r[   rŠ   r   Zprocsr|   Úprocr   rŒ   r   Úrun_test_locally„   s"    þ

z&DataParallelModelTest.run_test_locallyc                 C   s  dD ]ø}|r t jrt  ¡ dk r q| jddg|d}| jdg|d}|  t ||¡¡ |rdt  ¡ dkrŒ| jttdƒƒ|d}|  t ||¡¡ |rœt  ¡ dkrÄ| jttdƒƒ|d}|  t ||¡¡ |rÔt  ¡ dkr| jttdƒƒ|d}|  t ||¡¡ qd	S )
z
        Test that the model produces exactly same results given
        total batchsize, independent of number of GPUs.
        ©TFrY   r   r/   )rz   é   é   r.   N)	r   Úhas_gpu_supportÚNumCudaDevicesr†   rx   rb   ÚallcloseÚlistre   )r   rz   Úresult_2gpusÚresult_1gpusÚresult_4gpusÚresult_8gpusZresult_16gpusr   r   r   Ú
test_equiv¬   s"    

ÿz DataParallelModelTest.test_equivc                 C   s  dd„ }dd„ }dd„ }t jddd	}tj||||d
ddgd t |¡}| d¡D ]$}|  ||k¡ |  |d |k¡ qT| d¡D ]}|  ||k¡ q„|  t 	d¡|k¡ | 
d¡D ]}|  ||k¡ q¶| 
d¡D ]}|  ||k¡ qÔ|  t 	d¡|k¡ |  t 	d¡|k¡ d S )Nc                 S   s   d S r   r   r)   r   r   r   Úadd_input_opsÇ   s    zCDataParallelModelTest.test_checkpoint_params.<locals>.add_input_opsc                 S   s¾   |   dd¡ | jdddddi fdddd	d
	 | jdddddd |  dd¡ | jddddd | jddddd |  dd¡ |  dd¡ |  ddgd¡ |  	dd¡}| j
jg dgdd |gS )Nr,   Ú	data_nchwÚconv1é   rU   ÚMSRAFillé   rY   r   ©Zweight_initÚkernelÚstrideÚpadZno_biasÚconv1_spatbn_reluçü©ñÒMbP?F©ÚepsilonZis_testÚpool1©r¯   r°   r-   é  éd   ©Údim_inÚdim_outÚfc_sigmÚsoftmaxr3   Zxentr5   Úfc_w)r¸   iè  r9   )Ú	NHWC2NCHWÚConvÚ	SpatialBNÚReluÚMaxPoolr;   r=   ÚSoftmaxZLabelCrossEntropyr?   rA   r0   ©r*   rC   r5   r   r   r   Úadd_model_opsÊ   s,       þ  ÿzCDataParallelModelTest.test_checkpoint_params.<locals>.add_model_opsc                 S   s   t j| dddd d S ©NrE   rF   gÍÌÌÌÌÌì?)rG   ÚmomentumrH   r)   r   r   r   rI   Þ   s    zCDataParallelModelTest.test_checkpoint_params.<locals>.add_optimizerrJ   ÚtestrK   r/   rY   r«   ©r+   rO   rP   rQ   zcpu_1/Ú	_momentumzcpu_2/zcpu_1/fc_w_momentumz
cpu_1/dataZoptimizer_iteration)r   r^   r   ÚParallelize_CPUZGetCheckpointParamsÚ	GetParamsrx   ÚassertFalser   ZBlobReferenceZGetComputedParams)r   r¨   rÇ   rI   r*   Zcheckpoint_paramsÚpÚcr   r   r   Útest_checkpoint_paramsÆ   s:    þû	
ÿz,DataParallelModelTest.test_checkpoint_paramsc                    s®   t  ¡ ‰ tjˆ ddddd}tjˆ |dddd}tjˆ |dddd dd	„ }‡ fd
d„}dd„ }tjddd}tj||||tdƒd t	 
|j¡ t	 |j¡ t	 |j¡ d S )Nr,   Z	other_fc1iÛ[ rV   rº   Z	other_fc2Z	other_fc3c                 S   s6   | j jg dgddddgd | j jg dgdgd d S )Nr,   r   éã   r«   r9   r3   )rr   rB   r)   r   r   r   r¨     s    zODataParallelModelTest.test_net_conversion_and_append_net.<locals>.add_input_opsc                    sÆ   |   dd¡ | jdddddi fdddd	d
	 | jdddddd |  dd¡ | jddddd | jddddd t ˆ j¡}| j 	|¡ | j
 	t ˆ j
¡¡ |  dd¡ |  dd¡ |  dd¡}|gS )Nr,   r©   rª   r«   rU   r¬   r­   rY   r   r®   r²   r³   Fr´   r¶   r·   r-   r¸   rV   rº   r½   r¾   r5   )rÀ   rÁ   rÂ   rÃ   rÄ   r;   r   ZConvertNetForDevicerr   Z	AppendNetrA   r=   rÅ   r?   )r*   rC   Z	appendnetr5   ©Úotherr   r   rÇ     s*       þ
ÿzODataParallelModelTest.test_net_conversion_and_append_net.<locals>.add_model_opsc                 S   s   t j| dddd d S rÈ   rH   r)   r   r   r   rI     s    zODataParallelModelTest.test_net_conversion_and_append_net.<locals>.add_optimizerÚNCHWrÊ   rK   r   rË   )r   ÚModelHelperr
   r-   r   r^   r   rÍ   re   r   rp   rA   rq   rr   rt   )r   Úfc1Zfc2r¨   rÇ   rI   r*   r   rÔ   r   Ú"test_net_conversion_and_append_netý   s*    þû	z8DataParallelModelTest.test_net_conversion_and_append_netzTest fails on GPU/REc              	   C   s0   dd„ }t ƒ }| j|dd |d W 5 Q R X d S )Nc           
      S   s–   dd„ }dd„ }dd„ }d}t  tjdg |g|d	¡ t|| |d
d}tjddd}tj||||dddg|d t 	|¡ t
dƒD ]}	t |¡ q‚d S )Nc                 S   s   d S r   r   r)   r   r   r   r¨   4  s    zVDataParallelModelTest.test_synchronization_barrier.<locals>.run.<locals>.add_input_opsc                 S   s   g S r   r   ©r*   rC   r   r   r   rÇ   7  s    zVDataParallelModelTest.test_synchronization_barrier.<locals>.run.<locals>.add_model_opsc                 S   s   d S r   r   r)   r   r   r   rI   :  s    zVDataParallelModelTest.test_synchronization_barrier.<locals>.run.<locals>.add_optimizerÚstore_handlerÚFileStoreHandlerCreate©ÚpathÚGLOO©Z
kv_handlerZshard_idZ
num_shardsZenginerJ   rÊ   rK   r/   rY   r«   )r+   rO   rP   rQ   Ú
rendezvous)r   ÚRunOperatorOncer   ÚCreateOperatorÚdictr   r^   r   rÍ   Ú
RunInitNetre   ZSynchronize)
r‘   r   r   r¨   rÇ   rI   rÛ   rá   r*   Ú_r   r   r   Úrun3  sB    üÿüþú
z?DataParallelModelTest.test_synchronization_barrier.<locals>.runrY   ©r   r[   r   ©r   r›   ©r   rç   r   r   r   r   Útest_synchronization_barrier1  s    )üz2DataParallelModelTest.test_synchronization_barrierc              	   C   s0   dd„ }t ƒ }| j|dd |d W 5 Q R X d S )Nc           	   	   S   sº   dd„ }dd„ }dd„ }t  ¡  d}t  tjdg |g|d	¡ t|| |d
d}tjddd}dt_	tj
||||dddg|dd t |¡ t |d¡ | dkrªt tj	¡ t |d¡ d S )Nc                 S   s   d S r   r   r)   r   r   r   r¨   f  s    z`DataParallelModelTest.test_pre_train_synchronization_barrier.<locals>.run.<locals>.add_input_opsc                 S   s   g S r   r   rÚ   r   r   r   rÇ   i  s    z`DataParallelModelTest.test_pre_train_synchronization_barrier.<locals>.run.<locals>.add_model_opsc                 S   s   d S r   r   r)   r   r   r   rI   l  s    z`DataParallelModelTest.test_pre_train_synchronization_barrier.<locals>.run.<locals>.add_optimizerrÛ   rÜ   rÝ   rß   rà   rJ   rÊ   rK   rY   r/   r«   é   )r+   rO   rP   rQ   rá   Zbarrier_net_timeout_secr   )r   r]   râ   r   rã   rä   r   r^   r   Z_DEFAULT_TIMEOUT_SECrÍ   rå   rt   ÚtimeÚsleep)	r‘   r   r   r¨   rÇ   rI   rÛ   rá   r*   r   r   r   rç   e  sL    üÿüþù	
zIDataParallelModelTest.test_pre_train_synchronization_barrier.<locals>.runrY   rè   ré   rê   r   r   r   Ú&test_pre_train_synchronization_barrierc  s    0üz<DataParallelModelTest.test_pre_train_synchronization_barrierc              
   C   sH   |   t¡4 t t tjd¡¡ t d d d ¡ W 5 Q R X W 5 Q R X d S )Nr   )	ÚassertRaisesÚAssertionErrorr   rk   rl   r   ÚGpuDeviceTyper   ÚParallelize_GPUr   r   r   r   Útest_device_scope_checkœ  s    z-DataParallelModelTest.test_device_scope_checkc                 C   sÂ   dddg}dd„ }dd„ }dd	„ }|||d
œ}t ƒ }||d< tjddd}tj|f|Ž |  |j¡ |  |jd¡ t ƒ }||d< ||d< tjddd}tj|f|Ž |  |j¡ |  |jd¡ d S )Nr/   rY   r«   c                 S   s   | j jg dgddgd d S )Nr,   é    rž   r9   )rA   rB   r)   r   r   r   r¨   ¤  s    zJDataParallelModelTest.test_net_transformer_function.<locals>.add_input_opsc                 S   s   t  | d¡ d S ©NrE   rH   r)   r   r   r   rI   §  s    zJDataParallelModelTest.test_net_transformer_function.<locals>.add_optimizerc                 S   s   t j| ddddd}|gS )Nr,   rØ   rž   rº   )r
   r-   )r*   rC   rØ   r   r   r   rÇ   ª  s    zJDataParallelModelTest.test_net_transformer_function.<locals>.add_model_ops)r+   rO   rQ   Znet_transformer_funÚrF)rM   Zinit_paramsrP   T)	r   r   r×   r   rÍ   rx   ÚcalledÚassertEqualZ
call_count)r   rQ   r¨   rI   rÇ   rŠ   Z	transformr*   r   r   r   Útest_net_transformer_function¡  s*    
ýz3DataParallelModelTest.test_net_transformer_functionr   iÿÿ  r/   é   )rd   r{   éÐ  ©Údeadlinec                 C   s   |   d||¡ d S )NÚcpu©Ú_bn_check_op_level©r   rd   r{   r   r   r   Ú!test_multi_device_bn_op_level_cpuÄ  s    z7DataParallelModelTest.test_multi_device_bn_op_level_cpuúNo gpu support.rY   úNeed at least 2 GPUs.c                 C   s   |   d||¡ d S )Nrz   r   r  r   r   r   Ú!test_multi_device_bn_op_level_gpuÉ  s    z7DataParallelModelTest.test_multi_device_bn_op_level_gpuc              
      s(  ddg‰d‰d}‡fdd„}‡ ‡fdd„}‡ ‡‡fdd	„}‡‡fd
d„}dd„ }	t j |¡ t ¡  tjddd}
tj|
|||	ˆˆdkddd t 	|
j
¡ t d ˆ¡¡}t d ˆ¡¡}t 	|
j¡ g }ˆD ]0}t d ˆ|¡¡}t j|t jd}| |¡ qÐ||ˆˆ||ˆƒ ||ˆˆ||ƒ dS )a   
        Test multi device batch normalization at the operation level. This is
        done by checking the outputs of batch normalization and its gradient
        operator. We compare values produced with our manually calculated
        batch normalization values and gradients.
        r   r/   r³   c                    sš   t  | ¡}t j|dd}t j|dd}|D ]j}	| |	 }
|
| t  || ¡ }|| | }t d ||	¡¡}t j 	|| ¡t j 	|¡ }ˆ  
|dk ¡ q*d S )Nr   ©Úaxisú{}_{}/bn_outg{®Gázt?)rb   ÚconcatenateÚmeanÚvarÚsqrtr   rw   r_   ÚlinalgÚnormrx   )ÚxrQ   Údevice_typer7   Úbiasrµ   Zx_concatr  r  ÚdeviceÚx_iZx_hatZexpected_outZspatial_outÚ	rel_errorr   r   r   Ú_test_forward_passÛ  s    

ÿ
ÿzDDataParallelModelTest._bn_check_op_level.<locals>._test_forward_passc                    sþ  g }g }g }t |ƒ}tjt d |¡¡tjd}	tjt d |¡¡tjd}
|D ]P}t d ||¡¡}tj|tjd}| |¡ | tjtj|ddtjd¡ qPtj|tjd}|| }|D ]<}tjt d ||¡¡tjd}ˆ 	tj
|g|g|d¡ q¾|D ]6}tj| | |	 |
 ||  dtjd	}| |¡ q tj|dtjd	}|| }|D ]2}t d
 ||¡¡}ˆ 	tj
|g|g|d¡ qV||
 ˆ  }|D ]^}||| ˆ  | | | |	 | |
   }t d ||¡¡}ˆ 	tj
|g|g|d ¡ ¡ qšd S )Nz{}_0/bn_out_sm©Údtypez{}_0/bn_out_sivz{}_{}/bn_out_gradr   r  ú{}_{}/bn_out_b_grad©Zatol)r  r  ú{}_{}/bn_out_s_gradú{}_{}/tanh_grad)ra   rb   rs   r   rw   r_   rj   r”   Úsumrx   ÚiscloseÚall)r  rQ   r  r7   Ú	toleranceZ	dBias_arrZdY_arrZ
dGamma_arrZnum_devicesr  Zinv_varr  ZdY_blobZdYZdBiasZ	dBias_avgZdBiasActualZdGammaZ
dGamma_avgZdGammaActualZscale_inv_varZdXZ	dX_actual)r{   r   r   r   Ú_test_backward_passé  sj    ÿÿÿÿ
ÿ
" ÿÿ ÿ
ÿÿ
ÿzEDataParallelModelTest._bn_check_op_level.<locals>._test_backward_passc                    s<   ˆD ]2}t j ˆ ddd¡ t j¡}t d ˆ|¡|¡ qd S )Nr/   rW   )rb   rc   rf   ri   rj   r   rn   r_   )r*   r  r,   )r{   r  rQ   r   r   r¨     s    z?DataParallelModelTest._bn_check_op_level.<locals>.add_input_opsc                    s`   ˆ dkr"|   dd¡ |  dd¡ n|  dd¡ | jdddˆdd |  dd	¡ |  d	d
¡}|gS ©Nrz   r,   Zdevice_dataÚtanhZbn_outr/   Fr´   Zsqrr5   ©ÚCopyCPUToGPUZTanhrÂ   ZSqrZSumElementsrÆ   ©r  rµ   r   r   rÇ     s    z?DataParallelModelTest._bn_check_op_level.<locals>.add_model_opsc                 S   s   t  | d¡S rö   rH   r)   r   r   r   rI   (  s    z?DataParallelModelTest._bn_check_op_level.<locals>.add_optimizerrÖ   rÊ   rK   rÿ   FTrN   z{}_0/bn_out_sz{}_0/bn_out_bz
{}_{}/tanhr  N)rb   rc   rd   r   r]   r   r^   r   r`   rp   rA   rw   r_   rr   rs   rj   r”   )r   r  rd   r{   r   r  r!  r¨   rÇ   rI   r*   r7   r  r  r  Zx_blobr  r   )r{   r  rQ   rµ   r   r   r  Ð  sF    /þøz(DataParallelModelTest._bn_check_op_levelc                 C   s&   |d dkr|d7 }|   d||¡ d S )NrY   r/   rÿ   ©Ú_test_multi_device_bn_net_lvlr  r   r   r   Ú test_multi_device_bn_net_lvl_cpuJ  s    z6DataParallelModelTest.test_multi_device_bn_net_lvl_cpuc                 C   s&   |d dkr|d7 }|   d||¡ d S )NrY   r/   rz   r'  r  r   r   r   Ú test_multi_device_bn_net_lvl_gpuQ  s    z6DataParallelModelTest.test_multi_device_bn_net_lvl_gpuc                    sª  ‡fdd„}‡ ‡‡‡fdd„}ddg}d‰d}t j |¡ t j ˆ ddd¡ t j¡‰t  ˆˆ dddf¡‰t ¡  |dd	}t 	|j
¡ t 	|j¡ t d
 ˆ¡¡}	i }
t d ˆ¡¡|
d< t d ˆ¡¡|
d< t d ˆ¡¡|
d< t ¡  |dd	}t 	|j
¡ t 	|j¡ g }i }|D ]x}d ˆ|¡}| t |¡¡ i ||< t d ˆ|¡¡|| d< t d ˆ|¡¡|| d< t d ˆ|¡¡|| d< q||ˆ||	||
|ƒ dS )a	  
        Test multi device batch normalization at the net level. This is done
        by verifying that the final batch normalization outputs and the
        gradient outputs from multiple devices are the same as those produced
        from a single device
        c              	      sÎ   t  |¡}ˆ  t j|g|g|d ¡ ¡ ddg}|D ]<}	||	 }
| D ]*}|| |	 }ˆ  t j|g|
g|d¡ qDq4|d d }|d d }t  ||g¡}|d }
t j ||
 ¡t j |
¡ }ˆ  |dk ¡ d S )Nr  Úbn_out_s_gradÚbn_out_b_gradr   Ú	tanh_gradr/   r³   )rb   r
  rx   r  r  r  r  )rQ   r  r   Úsingle_device_bn_outÚtwo_device_bn_out_valsÚsingle_device_gradsÚtwo_device_gradsZtwo_device_bn_outZgradient_namesrM   Zexpected_gradr  Zactual_gradZ
first_gradZsecond_gradr  r   r   r   Ú_verify_bn_outputse  s,    	
  ÿÿ
ÿzODataParallelModelTest._test_multi_device_bn_net_lvl.<locals>._verify_bn_outputsc           	   
      sˆ   ‡‡fdd„}‡ ‡‡fdd„}‡‡fdd„}dd„ }| rJ|}d	d
g}d}n|}d	g}d}t jddd}tj|||||ˆdkd|d |S )Nc                    s   t  d ˆ¡ˆ ¡ d S )Nú	{}_0/data)r   rn   r_   r)   )r,   r  r   r   Úadd_input_ops_no_combine†  s    zlDataParallelModelTest._test_multi_device_bn_net_lvl.<locals>._create_model.<locals>.add_input_ops_no_combinec                    sD   t ˆ d ƒ}t d ˆ¡ˆd |… ¡ t d ˆ¡ˆ|d … ¡ d S )NrY   r3  z	{}_1/data)Úintr   rn   r_   )r*   Zhalf)r{   r,   r  r   r   Úadd_input_ops_combine‰  s    ziDataParallelModelTest._test_multi_device_bn_net_lvl.<locals>._create_model.<locals>.add_input_ops_combinec                    s`   ˆ dkr"|   dd¡ |  dd¡ n|  dd¡ | jdddˆdd |  dd	¡ |  d	d
¡}|gS r"  r$  rÆ   r&  r   r   rÇ   Ž  s    zaDataParallelModelTest._test_multi_device_bn_net_lvl.<locals>._create_model.<locals>.add_model_opsc                 S   s   t  | d¡S rö   rH   r)   r   r   r   rI   ™  s    zaDataParallelModelTest._test_multi_device_bn_net_lvl.<locals>._create_model.<locals>.add_optimizerr   r/   TFrÖ   rÊ   rK   rÿ   rN   )r   r^   r   r`   )	Úmultiple_devicesr4  r6  rÇ   rI   Z	input_funrQ   rS   r*   )r{   r,   r  rµ   r   r   Ú_create_model…  s4    þø
zJDataParallelModelTest._test_multi_device_bn_net_lvl.<locals>._create_modelr   r/   r³   F)r7  z{}_0/bn_outz{}_0/bn_out_s_gradr+  z{}_0/bn_out_b_gradr,  z{}_0/tanh_gradr-  Tr	  r  r  r  N)rb   rc   rd   rf   ri   rj   Úreshaper   r]   rp   rA   rr   rw   r_   r”   )r   r  rd   r{   r2  r8  rQ   r   Zmodel_no_combiner.  r0  Zmodel_combiner/  r1  r  Zbn_out_blobr   )r{   r,   r  rµ   r   r   r(  Z  sf     /
ÿÿÿ

ÿ
ÿ
ÿùz3DataParallelModelTest._test_multi_device_bn_net_lvl)N)r$   r%   r&   r†   r›   r§   rÒ   rÙ   ÚunittestÚskiprë   rï   rô   rú   r   r‚   Zintegersr   r  ÚskipIfr   rŸ   r    r  r  r)  r*  r(  r   r   r   r   r'   '   s:   Z
(74
1
8#zr'   c                   @   s&   e Zd Zdd„ Ze d¡dd„ ƒZdS )ÚRecurrentNetworkParallelTestc                    sâ  dd„ }‡ fdd„}dd„ }t  ¡  tjd |¡d}d	ˆ _d
ˆ _d	ˆ _dˆ _ˆ jt	|ƒ ˆ _
tj|||||d| d |j ¡ jD ]}|j d¡r„d|_q„tj d¡ tddƒD ]}tj ˆ jˆ jˆ j¡}	tj ˆ jˆ jˆ j¡}
t|ƒD ]¬\}}|ˆ j
 }|ˆ j
 }|	dd…||…dd…f  tj¡}|
dd…||…dd…f  tj¡}t t |j|¡¡2 t  d |j|¡|¡ t  d |j|¡|¡ W 5 Q R X qì|dkr¼t   |j¡ t  !|j"¡ t  #|j" ¡ j$¡ q²t  %d |j¡¡S )r(   c                 S   s   d S r   r   r)   r   r   r   r+   î  s    zARecurrentNetworkParallelTest.run_model.<locals>.input_builder_func              	      s¶   t  t d¡tjˆ jgˆ j tjd¡ | j	j
g dddˆ jˆ jgd | j	j
g dddˆ jˆ jgd tj| ddd	ˆ jˆ jd
d\}}}}|  |  |dgd¡d¡}| j|d|d}|gS )NÚseq_lengthsr  Úhidden_initç        r/   )r!   r:   Ú	cell_initr,   )r?  rA  Zpartest)r*   Z
input_blobr>  Zinitial_statesr»   r¼   Zscoper’   Údistr5   Zloss_scaledr6   )r   rn   r   ZScopedBlobReferencerb   rs   ÚTr   Úint32rA   r0   Ú
hidden_dimr   ZLSTMÚ	input_dimr?   ZSubr@   )r*   rC   ÚoutputZ_last_hiddenræ   Z_last_stater5   r   r   r   rD   ñ  s>    þüüùþz?RecurrentNetworkParallelTest.run_model.<locals>.model_build_func                 S   sŠ   |   d¡}| jj|gdddd}| jjg ddgdd	}|  ¡ D ]"}| j| }|  ||||g|¡ q>t|  ¡ ƒt| j	ƒt| j
ƒ ks†t‚d S ©
NÚITERÚLRçš™™™™™¹¿rF   ©Zbase_lrrG   ÚONEr/   ç      ð?©r:   r!   )ÚIterrr   ÚLearningRaterA   r0   rÎ   Úparam_to_gradÚWeightedSumra   Úparamsrv   rñ   )r*   rI  rJ  rM  ÚparamÚ
param_gradr   r   r   Úparam_update_fun  s"    
ü   ÿ
z@RecurrentNetworkParallelTest.run_model.<locals>.param_update_funzrecurrent_test{})rM   rž   rU   é   T)r+   rO   Úparam_update_builder_funrQ   Zoptimize_gradient_memoryrR   ZFillr0   iÂw3r   rV   NrW   z{}_{}/targetz{}_0/partest/i2h_w)&r   r]   r   r^   r_   rC  r{   rF  rE  ra   r   r   r`   rA   ru   Úopr    Úendswithrb   rc   rd   re   rf   rh   ri   rj   r   rk   rl   rm   rn   ro   rp   rq   rr   rt   rM   rw   )r   rQ   rz   r+   rD   rW  r*   rZ  r|   r}   Zfull_targetr€   r   r‚   rƒ   r,   Útargetsr   r   r   r†   é  sf    $ÿù  ÿ

"" ÿ ÿ
z&RecurrentNetworkParallelTest.run_modelz>Test is flaky: https://github.com/pytorch/pytorch/issues/10322c                 C   s¶   dD ]¬}|rt jsq|  ddg|¡}|  dg|¡}|  t ||¡¡ |rTt  ¡ dkrz|  ttdƒƒ|¡}|  t ||¡¡ |rŠt  ¡ dkr|  ttdƒƒ|¡}|  t ||¡¡ qdS )z†
        Test that the model produces exactly same results given
        total batchsize, independent of number of GPUs/CPUs.
        rœ   r   r/   r   rž   N)	r   rŸ   r†   rx   rb   r¡   r    r¢   re   )r   rz   r£   r¤   r¥   r¦   r   r   r   Útest_equiv_recurrent^  s    
z1RecurrentNetworkParallelTest.test_equiv_recurrentN)r$   r%   r&   r†   r:  r;  r]  r   r   r   r   r=  ç  s   ur=  r  rY   r  c                   @   s(   e Zd ZdZdd„ Zdd„ Zdd„ ZdS )	ÚSparseDataParallelModelTestúe
    Create and run the model. We try with both storing indices for gather
    on CPU and on GPU
    c                    sØ  dd„ }‡ ‡fdd„}‡fdd„}t  ¡  tjdd |¡d	}t d
¡Œ t t t	j
¡¡n | d¡ˆ_|jjˆjgddddˆ_|jjg d|dgdˆ_ˆ r®|j ˆj¡ |jjg ddgddˆ_W 5 Q R X W 5 Q R X tj|||||d ˆ rdt d
¡^ t t t	j
¡¡@ | ¡ D ]0}|j| }	| |ˆj|	j|	jˆjgˆj¡ qW 5 Q R X W 5 Q R X n.t t t jd¡¡ | dˆj¡ W 5 Q R X t j! "d¡ d}
t#ddƒD ]î}t j! $|¡d |
d …  %|
d¡}|d d …df d }|
t&|ƒ }t'|ƒD ]Ä\}}|| }|| }|||…d d …f  (t j)¡}|||…  (t j*¡}t t	j
¡}ˆ sbt t j|¡}t |¡ t  +d |¡|¡ W 5 Q R X t t t j|¡¡ t  +d |¡|¡ W 5 Q R X qú|dkr8t  ,|j¡ t j! -|d¡ (t j*¡}t  +ˆj|¡ ˆ s,|D ]&}t j+d  |¡|t t j|¡d! qt  .|j¡ t  /|j 0¡ j1¡ t&|ƒdkr¬ˆ s¬t  2d"¡}t3| 4¡ ƒ}t&|ƒ}t&t5|ƒƒ}||ks¬t6d#ƒ‚q¬ˆ 7t  8t  2ˆj¡|¡¡ t  2ˆ rÈˆjnd¡t  2d$¡gS )%Nc                 S   s   d S r   r   r)   r   r   r   r+     s    z@SparseDataParallelModelTest.run_model.<locals>.input_builder_func              	      sâ   ˆ rDt  t  tj¡¡ | j ˆjdgd¡}W 5 Q R X |  |d¡}n.| j	 ˆjd¡}| j
 |¡ | j |dgd¡}|  |d¡}|  |dddd	i fd	i f¡}|  |d
¡}|  |d¡}|  |dgd¡}	|  |	d¡}
| j|
|d}
|
gS )NÚindicesÚgathered_cpuÚgatheredZgpuvecsÚ	flattenedr-   é   r/   r0   r1   r2   r3   r4   r5   r6   )r   rk   rl   r	   ÚCPUrr   ÚGatherÚvecsr%  rA   rT  r”   ZFlattenr;   r<   r=   r>   r?   r@   )r*   rC   ra  rb  Úgpu_vecsrc  r-   r1   r2   r4   r5   ©Úcpu_indicesr   r   r   rD   ‚  s0     ÿ ÿ ÿz>SparseDataParallelModelTest.run_model.<locals>.model_build_func                    s¤   | j jg ddgdd}|  ˆ jd¡}|  ¡ D ]r}| j| }t|tjƒs\|  	||||g|¡ q,| j j|g|d dd}| j
j|j||||jg|j||gd	d
d q,d S )NrM  r/   rN  rO  rJ  rÌ   r@  )r!   rE   r   )rÉ   Znesterov)rA   r0   r%  rJ  rÎ   rR  Ú
isinstancer   ÚGradientSlicerS  rr   ZSparseMomentumSGDUpdateÚvaluesr`  )r*   rM  rJ  rU  rV  Zparam_momentumr   r   r   rW  ™  s<       ÿ
ýû  ÿôz?SparseDataParallelModelTest.run_model.<locals>.param_update_funrJ   úsparse_test{}rK   rÿ   rI  rJ  rK  rF   rL  rg  r.   r9   ÚONE_CPUr/   rN  rO  ©r+   rO   rY  rQ   r   zgpu_0/gpuvecsrT   rU   rV   rY   úgpu_{}/indicesúgpu_{}/labelzgpu_{}/gpuvecsrZ   úgpu_0/indicesz We cannot have duplicate indicesz
gpu_0/fc_w)9r   r]   r   r^   r_   r   Ú	NameScoperk   rl   r	   re  rP  rI  rr   rQ  rJ  rA   rB   rg  rT  r”   r0   ro  r   ró   rÎ   rR  ÚScatterWeightedSumr`  rm  rò   ÚCopyGPUToCPUrb   rc   rd   re   Úpermutationr9  ra   rh   ri   rD  rj   rn   rp   rf   rq   rt   ru   rM   rw   r¢   ÚflattenÚsetrñ   rÏ   r¡   )r   ÚVÚgpu_devicesrj  r+   rD   rW  r*   rU  rV  r{   r|   Úfull_indicesr~   r   r€   r   r‚   rƒ   r`  r„   Zdevice_for_indicesÚ	orig_vecsÚidxÚnÚnur   ri  r   r†   }  sÊ    þü  ÿ   ÿû	

ýü ÿ 
þý

ÿÿz%SparseDataParallelModelTest.run_modelc                 C   s  d}|   |ddg|¡}|   |dg|¡}|  t |d |d ¡¡ |  t |d |d ¡¡ t ¡ dkr°|   |ttdƒƒ|¡}|  t |d |d ¡¡ |  t |d |d ¡¡ t ¡ dkr|   |ttdƒƒ|¡}|  t |d |d ¡¡ |  t |d |d ¡¡ dS )z‰
            Test that the model produces exactly same results given
            total batchsize, independent of number of GPUs.
        é'  r   r/   r   rž   N)r†   rx   rb   r¡   r   r    r¢   re   )r   rj  rz  r£   r¤   r¥   r¦   r   r   r   Ú_test_equiv_sparse   s    z.SparseDataParallelModelTest._test_equiv_sparsec                 C   s   |   d¡ |   d¡ d S )NTF)r‚  r   r   r   r   Útest_equiv_sparse6  s    
z-SparseDataParallelModelTest.test_equiv_sparseN)r$   r%   r&   Ú__doc__r†   r‚  rƒ  r   r   r   r   r^  u  s
    $r^  c                   @   sL   e Zd Zdd„ Zdd„ Zdd„ Zdd„ Zee 	¡ d	e
d
ddd„ ƒƒZdS )ÚParallelizeBMUFTestc                 C   s   dd„ }dS )r(   c                 S   s   d S r   r   r)   r   r   r   r+   C  s    z9ParallelizeBMUFTest._run_model.<locals>.input_builder_funNr   )r   r{  r+   r   r   r   Ú
_run_model?  s    zParallelizeBMUFTest._run_modelc              	   C   sd   |  dddddi fdi f¡}| |d¡}| |d¡}| |dgd	¡}| |d
¡}|j||d}|gS )Nr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   )r;   r<   r=   r>   r?   r@   )r   r*   rC   r-   r1   r2   r4   r5   r   r   r   Ú_model_build_funF  s         ÿz$ParallelizeBMUFTest._model_build_func                 C   sf   |  d¡}|jj|gdddd}|jjg ddgdd	}| ¡ D ]"}|j| }| ||||g|¡ q>d S rH  )rP  rr   rQ  rA   r0   rÎ   rR  rS  )r   r*   rI  rJ  rM  rU  Zgradr   r   r   Ú_param_update_funR  s     
ü   ÿ
z%ParallelizeBMUFTest._param_update_func                 C   sì   t j d¡ d}tddƒD ]Ì}t j |d¡}t  |d d …df ¡}|t|ƒ }t|ƒD ]Ž\}	}
|	| }|| }|||…d d …f  t j	¡}|||…  t j	¡}t
 t
 ||
¡¡. t d ||
¡|¡ t d ||
¡|¡ W 5 Q R X qVqd S )Né   rU   r   rV   r.   rW   rX   )rb   rc   rd   re   rf   rg   ra   rh   ri   rj   r   rk   rl   r   rn   r_   )r   rQ   r  Údevice_prefixr{   ræ   r}   r~   r   r€   r   r‚   rƒ   r,   r„   r   r   r   Ú_generate_dataa  s    z"ParallelizeBMUFTest._generate_data)rR   rü   rý   c                 C   s¸  t |ptjptjƒ t ¡  tjddd}ddg}dd„ }|sJtj}d}n
tj	}d	}|  
|||¡ tj||| j| j||d
 t |¡ |  tt|jƒƒddg¡ |  t d |¡¡d¡ tj t d |¡¡t d¡ tj¡ dd¡¡ t |d¡ t d |¡¡}t d |¡¡}t |j¡ t d |¡¡}	t d |¡¡}
t d |¡¡}t d |¡¡}t d |¡¡}t d |¡¡}t |j ¡ |	| d | }|
| d | }t d |¡¡}t d |¡¡}t d |¡¡}t d |¡¡}t d |¡¡}t d |¡¡}t d |¡¡}t d |¡¡}tj |d| | ¡ tj |d| | ¡ tj ||¡ tj ||¡ tj ||¡ tj ||¡ tj ||| ¡ tj ||| ¡ d S )NrJ   rÊ   rK   r   r/   c                 S   s   d S r   r   r)   r   r   r   r+   ‚  s    zDParallelizeBMUFTest.test_parallelize_bmuf.<locals>.input_builder_funrz   rÿ   )rQ   rR   r¿   Zfc_bz{}_0/fc_b_vz{}_0/fc_w_vr.   z	{}_0/fc_br\   z	{}_1/fc_bz	{}_1/fc_wz{}_0/fc_b_gz{}_0/fc_w_grY   g      à?)!r   r   rŸ   Zhas_hip_supportr]   r   r^   rò   r	   re  r‹  r   ZParallelize_BMUFr‡  rˆ  rå   rù   r¢   r   Z_device_grouped_blobsrw   r_   rb   ZtestingZassert_equalÚzerosri   rj   r9  rt   rp   rr   Z_global_model_param_updates_net)r   rR   r*   rQ   r+   r  rŠ  Zv_b_Zv_w_Zb_0_Zw_0_Zb_1_Zw_1_Zb_g_Zw_g_Zg_bZg_wZv_bZv_wZw_gZb_gZw_0Zb_0Zw_1Zb_1r   r   r   Útest_parallelize_bmufs  sz    þú	
 ÿþz)ParallelizeBMUFTest.test_parallelize_bmufN)r$   r%   r&   r†  r‡  rˆ  r‹  r   r‚   Zbooleansr   r  r   r   r   r   r…  ;  s   ÿr…  c                   @   s    e Zd ZdZdd„ Zdd„ ZdS )Ú,SparseDataParallelModelTestWithSharedIndicesr_  c                    s^  dd„ }‡fdd„}‡fdd„}t  ¡  tjdd |¡d	‰d
}|t|ƒ }t d¡š t t 	t
j¡¡| ˆ d¡ˆ_ˆjjˆjgddddˆ_‡ ‡fdd„tdƒD ƒˆ_ˆj ˆjjg d|dgd¡ ˆjjg ddgddˆ_W 5 Q R X W 5 Q R X tjˆ||||d t t 	t jd¡¡6 tˆjd d… ƒD ]\}}	ˆ d |¡|	¡ q2W 5 Q R X tddƒD ]ò}
tj d ¡ tj  ˆ ¡d |…  !|¡}|d d … | }t|ƒD ]†\}}|| }|| }|||…  "tj#¡}|||…  "tj#¡}t t 	t j|¡¡* t  $d! |¡|¡ t  $d" |¡|¡ W 5 Q R X q¨|
dkröt  %ˆj¡ tj &ˆ d¡ "tj'¡tj &ˆ ¡ "tj'¡tj &ˆ d¡ "tj'¡g}t(ˆj|ƒD ]\}	}t  $|	|¡ q|D ]<}t|ƒD ],\}}t j$d# ||¡|t 	t j|¡d$ q¸q¬t  )ˆj¡ t  *ˆj +¡ j,¡ t  -d%¡}d&d„ |D ƒ}|D ]0}t|ƒt|ƒks$t.d' t|ƒt|ƒ¡ƒ‚q$qdd S )(Nc                 S   s   d S r   r   r)   r   r   r   r+   Ù  s    zQSparseDataParallelModelTestWithSharedIndices.run_model.<locals>.input_builder_func                    sú   g }g }t ˆ jƒD ]:\}}| j |d |¡¡}|dkrB| j |¡ | |¡ qt |ƒD ],\}}| j |dgd |¡g¡}| |¡ qVt	|ƒdks”t
‚| j |d |d |d gdg¡}| jj|d	gd
dgdd\}	}
| j|
|d}
| jj|
g dd |
gS )Nz	gpuvec_{}rY   r`  zgpu_vec_gathered_{}r«   r   r/   r-   r3   Zce_lossZavg_lossT)Z	only_lossr6   rV   )Úlimit)rh   rg  rA   r%  r_   rT  r”   rr   rf  ra   rñ   r;   ZSoftmaxWithLossr@   ZPrint)r*   rC   Zgpu_vecs_gatheredrh  ÚnumÚvecZgpu_vecZgpu_vec_gatheredr-   ræ   r5   r   r   r   rD   Ü  s@     ÿ
þýúý
zOSparseDataParallelModelTestWithSharedIndices.run_model.<locals>.model_build_func                    s~   | j jg ddgdd}|  ˆ jd¡}|  ¡ D ]L}| j| }t|tjƒs\|  	||||g|¡ q,| j
 |||j|j|g|¡ q,d S )NrM  r/   rN  rO  rJ  )rA   r0   r%  rJ  rÎ   rR  rk  r   rl  rS  rr   ru  r`  rm  )r*   rM  rJ  rU  rV  r   r   r   rW     s(       ÿ
ûøzPSparseDataParallelModelTestWithSharedIndices.run_model.<locals>.param_update_funrJ   rn  rK   rõ   rÿ   rI  rJ  rK  rF   rL  c                    s(   g | ] }ˆj jg d  |¡ˆ dgd‘qS )zvec_{}r.   r9   )rA   rB   r_   )Ú.0r  )rz  r*   r   r   Ú
<listcomp>,  s   þ  ÿzJSparseDataParallelModelTestWithSharedIndices.run_model.<locals>.<listcomp>rY   Zvec_2r.   r9   ro  r/   rN  rO  rp  r   éÿÿÿÿzgpu_0/gpuvec_{}rV   rT   rq  rr  zgpu_{}/gpuvec_{}rZ   rs  c              	   S   s,   g | ]$}t d ƒD ]}t d ||¡¡‘qqS )rY   zgpu_{}/gpu_vec_gathered_{}_grad)re   r   rw   r_   )r’  r   r  r   r   r   r“  t  s    
 þ
ÿzjNumber of indices {} is not same as number of gradient slices {}. This might lead to illegal memory access)/r   r]   r   r^   r_   ra   r   rt  rk   rl   r	   re  rP  rI  rr   rQ  rJ  re   rg  r”   rA   rB   r0   ro  r   ró   rò   rh   rv  rb   rc   rd   rw  r9  ri   rD  rn   rp   rf   rj   Úziprq   rt   ru   rM   rw   rñ   )r   rz  r{  r+   rD   rW  r{   r   r  r‘  r|   r|  r~   r€   r   r‚   rƒ   r`  r„   r}  Zorig_vecr~  Zgrad_slicesZ
grad_slicer   )rz  r*   r   r   r†   ×  sº    $þüý þÿ   ÿû	 ÿ 
ýþ
 ÿý
ý þÿz6SparseDataParallelModelTestWithSharedIndices.run_modelc                 C   sf   d}|   |ddg¡ |   |dg¡ t ¡ dkrB|   |ttdƒƒ¡ t ¡ dkrb|   |ttdƒƒ¡ dS )z˜
            Test that the model has same number of indices and gradient rows
            given total batchsize, independent of number of GPUs.
        r  r   r/   r   rž   N)r†   r   ÚNumGpuDevicesr¢   re   )r   rz  r   r   r   Útest_sparse_shared_indices_gpu‚  s    zKSparseDataParallelModelTestWithSharedIndices.test_sparse_shared_indices_gpuN)r$   r%   r&   r„  r†   r—  r   r   r   r   rŽ  Ï  s    ,rŽ  Ú__main__)2Zfuture.utilsr   Úmultiprocessingr   r   Znumpyrb   Úosr   r   r:  rí   Zmockr   Z
hypothesisr   r   r   Zhypothesis.strategiesZ
strategiesr‚   Zcaffe2.protor	   Zcaffe2.pythonr
   r   r   r   r   r   r   r   r   Zcaffe2.python.test_utilr   ZInitOpsLibraryr   r<  Úenvironr™   r'   r=  rŸ   r    r^  r–  r…  rŽ  r$   Úmainr   r   r   r   Ú<module>   sN   ,
     D  E  B
