U
    0-eB                     @   s   d dl mZ d dlZd dlm  mZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZmZ d
d Zd#ddZd$ddZdd Zdd Zdd Zd%ddZd&ddZdd Z dd Z!d d! Z"e#d"k re!  dS )'    )deepcopyN)AdamW)LambdaLR)
DataLoader)Accelerator)GradientState)RegressionDatasetRegressionModel)DistributedTypeis_torch_versionset_seedc              	   C   s   t |  | D ]~\}}|js"q|s\t|j|jdkstd| d|j d|j dqt|j|jdkstd| d|j d|j dqd S )	NF7Gradients in sync when they should not be at iteration z:
model_a grad (z) == model_b grad ()T7Gradients not in sync when they should be at iteration z) != model_b grad ()zip
parametersrequires_gradtorchallclosegradAssertionError)Zmodel_aZmodel_bdid_step	iterationparamZ
grad_param r   h/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/accelerate/test_utils/scripts/test_sync.pycheck_model_parameters   s    r   Tc                 C   sJ   |    | |}t|||j}|s<||j }|  n
|| d S N)trainFmse_losstodevicegradient_accumulation_stepsbackward)modelinputtargetacceleratorZdo_backwardoutputlossr   r   r   
step_model-   s    

r+   Fc           	      C   s   t d t }t|}tdd}t|dd}|| j |r|t| dd}t| dd}t	|dd	 d
}t	|dd	 d
}|r| 
||||\}}}}n| 
||\}}|r|||||||fS |||fS )z3Returns everything needed to perform basic training*   P   length   Z
batch_sizegMbP?)paramslrc                 S   s   | d S Ng?r   epochr   r   r   <lambda>C       z$get_training_setup.<locals>.<lambda>)Z	lr_lambdac                 S   s   | d S r4   r   r5   r   r   r   r7   D   r8   )r   r	   r   r   r   r!   r"   r   r   r   prepare)	r(   schedr%   	ddp_modelZdset
dataloaderoptddp_opt	ddp_schedr   r   r   get_training_setup8   s"    
r@   c              
   C   s*  t | \}}}tt| \}}tdD ]}| ||f\}}|| j|| j }}t||||  |d dkr| 	| t||||  W 5 Q R X nt||||  t
||d| t| | D ]:\}	}
|	jsqt|	j|
jstd|	j d|
j dqtd|  |tt| }q*d S )	N      r   T7Gradients not in sync when they should be:
Model grad () != DDP grad (r   9  )r@   nextitervaluesrangegatherr!   r"   r+   no_syncr   r   r   r   r   r   r   r   manual_seedrandpermlenr(   r%   r;   r<   	ddp_input
ddp_targetr   r&   r'   r   	ddp_paramr   r   r   test_noop_syncO   s,     rS   c              
   C   sb  t | \}}}tt| \}}tdD ]0}| ||f\}}|| j|| j }}t||||  |d dkr| 	| t||||  W 5 Q R X nt||||  t
| | D ]~\}	}
|	jsq|d dkrt|	j|
jdks:td|	j d|
j dqt|	j|
jdkstd	|	j d
|
j dqtd|  |tt| }q*d S )NrA   rB   r   F7Gradients in sync when they should not be:
Model grad () == DDP grad (r   TrC   rD   rE   )r@   rF   rG   rH   rI   rJ   r!   r"   r+   rK   r   r   r   r   r   r   r   rL   rM   rN   rO   r   r   r   test_distributed_syncq   s0    rV   c              
   C   s  t | \}}}g }d}t|D ]}tt| \}}| ||f\}	}
|	| j|
| j }	}
t||	|
|  | 	|, ||}t
|||j}|| W 5 Q R X qt|D ]}|| }||d k r.| | t| | D ]>\}}|jsqt|j|jdkstd|j d|j dqq| | | | W 5 Q R X t| | D ]F\}}|jstq`t|j|jdks`td|j d	|j dq`qd S )
NrA      FrT   rU   r   TrC   rD   )r@   rI   rF   rG   rH   rJ   r!   r"   r+   rK   r   r    appendr$   r   r   r   r   r   r   r   Ztrigger_sync_in_backward)r(   r%   r;   r<   ZlossesZnum_iterationsr   rP   rQ   r&   r'   Z
ddp_outputr*   r   rR   r   r   r   "test_distributed_sync_multiple_fwd   s@    
rY   c              
   C   sz  t | |dd}t|\}}}t|D ]F\}}| \}}	|||	f\}
}|
|j||j }
}t||
||d || t|||	| W 5 Q R X t	|
 |
 D ]\}}|jsq|d d dks|t|d krt|j|jdksJtd| d|j d	|j d
qt|j|jdkstd| d|j d|j d
qtd|  |tt| }q$t  d S )NrB   split_batchesdispatch_batchesr#   FrW   r   Tr   z:
Model grad (rD   r   r   rU   rE   )r   r@   	enumeraterH   rJ   r!   r"   r+   
accumulater   r   r   rN   r   r   r   r   rL   rM   r   _reset_state)r[   r\   r(   r%   r;   r<   r   batchrP   rQ   r&   r'   r   rR   r   r   r   test_gradient_accumulation   s8      "ra   c              
   C   s  t | |dd}t|d\}}}}}}}	t|D ]\}
}| \}}|||f\}}||j||j }}|  |  t||||d |	  |
d d dks|
d t
|kr| r|	  nt|jD ]}|	  q|  ||, t|||| |	  |		  |  W 5 Q R X |jd d |jd d ksjtd|jd d  d	|jd d  d
|
d d dkp|
d t
|k}|jdkrt||||
 td|
  q.t  d S )NrB   rZ   TFrW   r   r3   z:Learning rates found in each optimizer did not align
opt: z

DDP opt: 
rE   )r   r@   r]   rH   rJ   r!   r"   r   r+   steprN   rI   Znum_processesZ	zero_gradr^   Zparam_groupsr   r   r   rL   r   r_   )r[   r\   r(   r%   r=   r:   r<   r;   r>   r?   r   r`   rP   rQ   r&   r'   _r   r   r   r   1test_gradient_accumulation_with_opt_and_scheduler   sD       

$"re   c                  C   s  t  } tdd}t|dd}tdd}t|dd}| ||\}}| jjd ksRtt|D ]\}}t| jjt|kszt|t	|d k r| jj
rt|dkrt|D ]J\}}t| jjt|kst|t	|d k r| jj
rtq| jj
stqqZ| jj
sZtqZ| jjd kstd S )Nr-   r.   r0   r1   `   rW   )r   r   r   r9   Zgradient_stateZactive_dataloaderr   r]   idrN   Zend_of_dataloader)r(   Z
first_dsetZfirst_dataloaderZsecond_dsetZsecond_dataloaderr   rd   Z	batch_numr   r   r   test_dataloader_break   s&    

rh   c               	   C   s  t  } | j}|jdkrtd t  |jtjkrJ|jdkrBtd t|  |jtj	tj
tjfkr|jdkrrtd t|  |jdkrtd t|  |jtj	tj
fkrdD ]:}dD ]0}|jdkrtdd| d	| d
 t|| qqtdds|jtjkr|jdkrtdd t  |jtj	tj
fkrdD ]P}dD ]D}|sN|sNq:|jdkrrtdd| d	| d
 t|| q:q2d S )Nr   zA**Test `accumulate` gradient accumulation with dataloader break**z'**Test NOOP `no_sync` context manager**z.**Test Distributed `no_sync` context manager**zE**Test Distributed `no_sync` context manager with multiple forwards**)TFz+**Test `accumulate` gradient accumulation, z`split_batches=z` and `dispatch_batches=z`**<z2.0zH**Test `accumulate` gradient accumulation with optimizer and scheduler, z1`split_batches=False`, `dispatch_batches=False`**)r   stateZlocal_process_indexprintrh   Zdistributed_typer
   NOrS   Z	MULTI_GPUZ	MULTI_NPUZ	MULTI_CPUrV   rY   ra   r   re   )r(   rj   Zsplit_batchr\   r   r   r   main9  sV    




rm   c                 C   s
   t   d S r   )rm   )indexr   r   r   _mp_fni  s    ro   __main__)T)F)FF)FF)$copyr   r   Ztorch.nn.functionalnnZ
functionalr   Ztorch.optimr   Ztorch.optim.lr_schedulerr   Ztorch.utils.datar   Zaccelerate.acceleratorr   Zaccelerate.stater   Zaccelerate.test_utilsr   r	   Zaccelerate.utilsr
   r   r   r   r+   r@   rS   rV   rY   ra   re   rh   rm   ro   __name__r   r   r   r   <module>   s,   

"(6
&
+0
