# torch/optim/adam.py, reconstructed source.
#
# The original file here was a CPython 3.8 bytecode dump (adam.cpython-38.pyc
# under site-packages/torch/optim/). The binary marshal data has been
# stripped; the code below is rebuilt from the recoverable names, constants,
# and docstrings, so treat the control flow as a faithful best-effort
# reconstruction of the upstream PyTorch source rather than a byte-exact copy.
from typing import List, Optional, Tuple, Union

import torch
from torch import Tensor

from .optimizer import (Optimizer, params_t, _use_grad_for_differentiable,
                        _get_value, _stack_if_compiling, _dispatch_sqrt,
                        _default_to_fused_or_foreach, _capturable_doc,
                        _differentiable_doc, _foreach_doc, _fused_doc,
                        _maximize_doc)
from torch.utils._foreach_utils import _get_fused_kernels_supported_devices

__all__ = ["Adam", "adam"]


class Adam(Optimizer):
    def __init__(self,
                 params: params_t,
                 lr: Union[float, Tensor] = 1e-3,
                 betas: Tuple[float, float] = (0.9, 0.999),
                 eps: float = 1e-8,
                 weight_decay: float = 0,
                 amsgrad: bool = False,
                 *,
                 foreach: Optional[bool] = None,
                 maximize: bool = False,
                 capturable: bool = False,
                 differentiable: bool = False,
                 fused: Optional[bool] = None):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if isinstance(lr, Tensor) and foreach and not capturable:
            raise ValueError("lr as a Tensor is not supported for capturable=False and foreach=True")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad,
                        maximize=maximize, foreach=foreach,
                        capturable=capturable, differentiable=differentiable,
                        fused=fused)
        super().__init__(params, defaults)

        if fused:
            if differentiable:
                raise RuntimeError("`fused` does not support `differentiable`")
            self._step_supports_amp_scaling = True
            fused_supported_devices = _get_fused_kernels_supported_devices()
            if not all(p.device.type in fused_supported_devices and
                       torch.is_floating_point(p)
                       for pg in self.param_groups for p in pg["params"]):
                raise RuntimeError("`fused=True` requires all the params to be floating point "
                                   f"Tensors of supported devices: {fused_supported_devices}.")
            if foreach:
                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")
zAdam.__init__c                    s´   t ƒ  |¡ | jD ]L}| dd¡ | dd¡ | dd ¡ | dd¡ | dd¡ | dd ¡ qt| j ¡ ƒ}t|ƒdkoŠt 	|d d	 ¡}|s°|D ]}t 
t|d	 ƒ¡|d	< q”d S )
Nr$   Fr   r   r   r   r   r   Ústep)r4   Ú__setstate__r8   Ú
setdefaultÚlistÚstateÚvaluesÚlenr)   Ú	is_tensorÚtensorÚfloat)r9   rA   ÚgroupZstate_valuesZstep_is_tensorÚsr;   r-   r.   r>   A   s    
zAdam.__setstate__c           
      C   sZ  |d D ]J}|j d k	r| |¡ |j jr2tdƒ‚| |j ¡ | j| }	t|	ƒdkrÊ|d sd|d rxtjdtj|j	dnt 
d¡|	d	< tj|tjd
|	d< tj|tjd
|	d< |d rÊtj|tjd
|	d< | |	d ¡ | |	d ¡ |d rü| |	d ¡ |d r|	d	 jrtdƒ‚|d rFt |d ¡rF|d sFtdƒ‚| |	d	 ¡ qd S )Nr   zJAdam does not support sparse gradients, please consider SparseAdam insteadr   r   r   r-   )Zdtyper'   r%   r=   )Zmemory_formatÚexp_avgÚ
exp_avg_sqr$   Úmax_exp_avg_sqr   zB`requires_grad` is not supported for `step` in differentiable moder   r    r&   )ÚgradÚappendZ	is_sparser6   rA   rC   r)   ZzerosrF   r'   rE   Z
zeros_likeZpreserve_formatZrequires_gradrD   )
r9   rG   Úparams_with_gradÚgradsÚexp_avgsÚexp_avg_sqsÚmax_exp_avg_sqsÚstate_stepsr+   rA   r-   r-   r.   Ú_init_groupP   s6    



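
    # Added note, not in the upstream source: after _init_group runs, each
    # parameter p that has a gradient owns per-parameter state of the form
    #
    #   self.state[p] == {
    #       "step":       tensor(0.),           # 0-dim float step counter
    #       "exp_avg":    torch.zeros_like(p),  # first moment, m_t
    #       "exp_avg_sq": torch.zeros_like(p),  # second moment, v_t
    #       "max_exp_avg_sq": ...,              # present only when amsgrad=True
    #   }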
ÿÿý$zAdam._init_groupc                 C   sÚ   |   ¡  d}|dk	r.t ¡  |ƒ }W 5 Q R X | jD ] }g }g }g }g }g }g }	|d \}
}|  |||||||	¡ t||||||	|d |
||d |d |d |d |d |d	 |d
 |d t| ddƒt| ddƒd q4|S )z±Performs a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr!   r$   r    r#   r"   r   r   r   r   r   Ú
grad_scaleÚ	found_inf)r$   Úbeta1Úbeta2r    r#   r"   r   r   r   r   r   rU   rV   )Z _cuda_graph_capture_health_checkr)   Zenable_gradr8   rT   r   Úgetattr)r9   ÚclosureZlossrG   rN   rO   rP   rQ   rR   rS   rW   rX   r-   r-   r.   r=   ‚   sX    

ù	

íz	Adam.step)r   r   r   r   F)N)Ú__name__Ú
__module__Ú__qualname__r	   r   rF   r   r   Úboolr   r5   r>   rT   r
   r=   Ú__classcell__r-   r-   r;   r.   r      s8        úô

ô22a  Implements Adam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)}          \\
            &\hspace{13mm}      \lambda \text{ (weight decay)},  \: \textit{amsgrad},
                \:\textit{maximize}                                                              \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0\leftarrow 0 \text{ (second moment)},\: \widehat{v_0}^{max}\leftarrow 0\\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\

            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
                \widehat{v_t})                                                                   \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, Tensor, optional): learning rate (default: 1e-3). A tensor LR
            is not yet supported for all our implementations. Please use a float
            LR if you are not also specifying fused=True or capturable=True.
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        {_maximize_doc}
        {_foreach_doc}
        {_capturable_doc}
        {_differentiable_doc}
        {_fused_doc}
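
    Example (added illustration; ``model``, ``loss_fn``, ``input`` and
    ``target`` below are placeholder names, not part of this module)::

        >>> optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()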
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ

    F)r   rO   rP   rQ   rR   rS   r   r   r   r   rU   rV   r$   rW   rX   r    r#   r"   r   c                C   sú   |	dkr8|dkr8t | |dd\}}|r8t|tƒr8|s8d}|	dkrDd}	|dkrPd}tj ¡ sttdd„ |D ƒƒsttdƒ‚|rŠtj 	¡ rŠtdƒ‚|	r tj 	¡ r tdƒ‚|	r´tj 	¡ s´t
}n|rÈtj 	¡ sÈt}nt}|| |||||||||||||||
|d	 dS )
zmFunctional API that performs Adam algorithm computation.
    See :class:`~torch.optim.Adam` for details.
    NF)Z	use_fusedc                 s   s   | ]}t |tjƒV  qd S ©N)r2   r)   r   )r*   Útr-   r-   r.   r/   (  s     zadam.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsz6torch.jit.script not supported with foreach optimizersz4torch.jit.script not supported with fused optimizers)r$   rW   rX   r    r#   r"   r   r   r   rU   rV   )r   r2   r   r)   Ú_utilsÚis_compilingr7   r6   ÚjitÚis_scriptingÚ_fused_adamÚ_multi_tensor_adamÚ_single_tensor_adam)r   rO   rP   rQ   rR   rS   r   r   r   r   rU   rV   r$   rW   rX   r    r#   r"   r   Ú_Úfuncr-   r-   r.   r   þ   sJ    ð)r   rO   rP   rQ   rR   rS   rU   rV   r$   rW   rX   r    r#   r"   r   r   r   c       	         C   s¤  |d kr|d kst ‚tj ¡ r,t|tƒs,t ‚t| ƒD ]h\}}|sJ|| n||  }|| }|| }|| }tj ¡ sš|rš|j	r†|j	sš|j
r’|j
sšt dƒ‚|d7 }|dkr¸|j||d}t |¡rt |¡}t |¡}t |¡}|røt || ¡||< t |¡}| |d|	 ¡ | |
¡j|| ¡ d|
 d |s<|rð|}d|	|  }d|
|  }|| }| ¡ }| ¡ }|rÈ|rŠ||  ¡ }n|| }||  t ||¡¡ ||  ¡ ||   || ¡}n| ¡ ||   || ¡}| ||¡ nŠt|ƒ}d|	|  }d|
|  }|| }t|ƒ}|rVtj|| ||| d ||  ¡ |  |¡}n| ¡ |  |¡}|j||| d |r4t | | ¡r4t || ¡||< q4d S )NzGIf capturable=True, params and state_steps must be CUDA or XLA tensors.r   r   ©Úalpha)Úvalue)Úout)ÚAssertionErrorr)   rd   re   r2   rF   Ú	enumeraterb   rc   Úis_cudaZis_xlaÚaddÚ
is_complexÚview_as_realZlerp_Zmul_Zaddcmul_ZconjÚnegÚsqrtÚcloneZcopy_ÚmaximumZadd_Zaddcdiv_r   r   Zview_as_complex)r   rO   rP   rQ   rR   rS   rU   rV   r$   rW   rX   r    r#   r"   r   r   r   ÚiÚparamrL   rI   rJ   Zstep_tr=   Úbias_correction1Úbias_correction2Ú	step_sizeZstep_size_negÚbias_correction2_sqrtrK   Údenomr-   r-   r.   rh   J  st    
ÿÿÿÿþ





def _multi_tensor_adam(params: List[Tensor],
                       grads: List[Tensor],
                       exp_avgs: List[Tensor],
                       exp_avg_sqs: List[Tensor],
                       max_exp_avg_sqs: List[Tensor],
                       state_steps: List[Tensor],
                       grad_scale: Optional[Tensor],
                       found_inf: Optional[Tensor],
                       *,
                       amsgrad: bool,
                       beta1: float,
                       beta2: float,
                       lr: Union[float, Tensor],
                       weight_decay: float,
                       eps: float,
                       maximize: bool,
                       capturable: bool,
                       differentiable: bool):
    if len(params) == 0:
        return

    if isinstance(lr, Tensor) and not capturable:
        raise RuntimeError("lr as a Tensor is not supported for capturable=False and foreach=True")

    if not torch._utils.is_compiling() and capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    assert grad_scale is None and found_inf is None

    assert not differentiable, "_foreach ops don't support autograd"

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps])
    for ((device_params, device_grads, device_exp_avgs, device_exp_avg_sqs,
          device_max_exp_avg_sqs, device_state_steps), _) in grouped_tensors.values():

        if maximize:
            device_grads = torch._foreach_neg(device_grads)

        # Handle complex parameters as their real views
        device_grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_grads]
        device_exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_exp_avgs]
        device_exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_exp_avg_sqs]
        device_max_exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_max_exp_avg_sqs]
        device_params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_params]

        # update steps
        torch._foreach_add_(device_state_steps, 1)

        if weight_decay != 0:
            if maximize:
                # Re-use the intermediate memory already allocated for maximize
                torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
            else:
                device_grads = torch._foreach_add(device_grads, device_params, alpha=weight_decay)

        # Decay the first and second moment running average coefficient
        torch._foreach_lerp_(device_exp_avgs, device_grads, 1 - beta1)

        torch._foreach_mul_(device_exp_avg_sqs, beta2)
        torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads, 1 - beta2)

        # Free the intermediate to save on peak memory
        del device_grads

        if capturable:
            bias_correction1 = torch._foreach_pow(beta1, device_state_steps)
            bias_correction2 = torch._foreach_pow(beta2, device_state_steps)
            # foreach_sub doesn't allow a scalar as the first arg
            torch._foreach_sub_(bias_correction1, 1)
            torch._foreach_sub_(bias_correction2, 1)
            # bias_correction1 gets negated later anyway, so only negate bc2
            torch._foreach_neg_(bias_correction2)

            # foreach_div doesn't allow a scalar as the first arg
            torch._foreach_div_(bias_correction1, lr)
            torch._foreach_reciprocal_(bias_correction1)

            torch._foreach_sqrt_(bias_correction2)

            # After the in-place ops above:
            #   step_size = -lr / (1 - beta1 ^ t)
            #   bias_correction2_sqrt = sqrt(1 - beta2 ^ t)
            step_size = bias_correction1
            bias_correction2_sqrt = bias_correction2

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch._foreach_maximum_(device_max_exp_avg_sqs, device_exp_avg_sqs)
                # Use the max. for normalizing running avg. of gradient
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_max_exp_avg_sqs)
            else:
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)

            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            torch._foreach_add_(exp_avg_sq_sqrt, eps)
            torch._foreach_div_(exp_avg_sq_sqrt, step_size)

            torch._foreach_addcdiv_(device_params, device_exp_avgs, exp_avg_sq_sqrt)
        else:
            bias_correction1 = [1 - beta1 ** _get_value(step) for step in device_state_steps]
            bias_correction2 = [1 - beta2 ** _get_value(step) for step in device_state_steps]

            step_size = _stack_if_compiling([(lr / bc) * -1 for bc in bias_correction1])

            bias_correction2_sqrt = [_dispatch_sqrt(bc) for bc in bias_correction2]

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch._foreach_maximum_(device_max_exp_avg_sqs, device_exp_avg_sqs)
                # Use the max. for normalizing running avg. of gradient
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_max_exp_avg_sqs)
            else:
                exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)

            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            torch._foreach_add_(exp_avg_sq_sqrt, eps)
            torch._foreach_addcdiv_(device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size)


def _fused_adam(params: List[Tensor],
                grads: List[Tensor],
                exp_avgs: List[Tensor],
                exp_avg_sqs: List[Tensor],
                max_exp_avg_sqs: List[Tensor],
                state_steps: List[Tensor],
                grad_scale: Optional[Tensor],
                found_inf: Optional[Tensor],
                *,
                amsgrad: bool,
                beta1: float,
                beta2: float,
                lr: Union[float, Tensor],
                weight_decay: float,
                eps: float,
                maximize: bool,
                capturable: bool,  # kept for signature consistency with the other impls
                differentiable: bool) -> None:
    if not params:
        return
    if differentiable:
        raise RuntimeError("Adam with fused=True does not support differentiable=True")

    grad_scale_dict = {grad_scale.device: grad_scale} if grad_scale is not None else None
    found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None

    # Only shuffle the lr between devices when it is a non-CPU tensor;
    # otherwise treat it as a plain scalar.
    lr_dict = {lr.device: lr} if isinstance(lr, Tensor) and str(lr.device) != "cpu" else None

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps])
    for (device, _), ((device_params, device_grads, device_exp_avgs, device_exp_avg_sqs,
                       device_max_exp_avg_sqs, device_state_steps), _) in grouped_tensors.items():
        device_grad_scale, device_found_inf = None, None
        if grad_scale is not None:
            if device not in grad_scale_dict:
                grad_scale_dict[device] = grad_scale.to(device, non_blocking=True)
            device_grad_scale = grad_scale_dict[device]
        if found_inf is not None:
            if device not in found_inf_dict:
                found_inf_dict[device] = found_inf.to(device, non_blocking=True)
            device_found_inf = found_inf_dict[device]
        if lr_dict is not None and device not in lr_dict:
            lr_dict[device] = lr.to(device=device, non_blocking=True)
            lr = lr_dict[device]
        torch._foreach_add_(device_state_steps, 1)
        torch._fused_adam_(device_params,
                           device_grads,
                           device_exp_avgs,
                           device_exp_avg_sqs,
                           device_max_exp_avg_sqs,
                           device_state_steps,
                           amsgrad=amsgrad,
                           lr=lr,
                           beta1=beta1,
                           beta2=beta2,
                           weight_decay=weight_decay,
                           eps=eps,
                           maximize=maximize,
                           grad_scale=device_grad_scale,
                           found_inf=device_found_inf)
        if device_found_inf is not None:
            torch._foreach_sub_(device_state_steps,
                                [device_found_inf] * len(device_state_steps))
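

# ---------------------------------------------------------------------------
# Added illustration, not part of the upstream module: a minimal self-check
# of one optimizer step against the update rule in the docstring above. At
# t = 1 with the defaults (weight_decay=0, amsgrad=False) the bias-corrected
# moments reduce to m_hat = g and v_hat = g * g, so
#   theta_1 = theta_0 - lr * g / (|g| + eps)
# The guard keeps this from running on import; the check uses only public
# torch APIs.
if __name__ == "__main__":
    torch.manual_seed(0)
    p = torch.randn(3, requires_grad=True)
    theta0 = p.detach().clone()
    opt = torch.optim.Adam([p], lr=1e-3)

    (p ** 2).sum().backward()          # gradient g = 2 * theta0
    g = p.grad.detach().clone()
    opt.step()

    lr, eps = 1e-3, 1e-8
    expected = theta0 - lr * g / (g.abs() + eps)
    assert torch.allclose(p.detach(), expected, atol=1e-6), "one-step Adam check failed"
    print("single-step Adam update matches the closed form at t = 1")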