U
    9%eR                     @   sX  d dl Z d dl mZ ddlmZmZmZmZmZmZm	Z	m
Z
mZ d dlmZmZ ddgZG dd deZd	d
e
 de de	 d e_dee ee ee ee ee ee eee eeeeeeeedddZee ee ee ee ee ee eeeeeeeeedddZee ee ee ee ee ee eeeeeeeeedddZdS )    N)Tensor   )		Optimizer_use_grad_for_differentiable
_get_value_dispatch_sqrt_stack_if_compiling_capturable_doc_differentiable_doc_foreach_doc_default_to_fused_or_foreach)ListOptionalNAdamnadamc                       sZ   e Zd Zdddddeee eed	 fd
dZ fddZdd ZedddZ	  Z
S )r   Mb`?g?g+?:0yE>r   Mbp?FN)foreach
capturabledifferentiable)decoupled_weight_decayr   r   r   c                   s   d|kst d| d|ks,t d| d|d   krDdk sXn t d|d  d|d   krpdk sn t d|d  d|kst d	| d|kst d
| t||||||||	|
d	}t || d S )N        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: r   z#Invalid beta parameter at index 1: zInvalid weight_decay value: zInvalid momentum_decay value: )	lrbetasepsweight_decaymomentum_decayr   r   r   r   )
ValueErrordictsuper__init__)selfparamsr   r   r   r   r   r   r   r   r   defaults	__class__ P/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torch/optim/nadam.pyr#   
   s*       zNAdam.__init__c                    s   t  | | jD ]4}|dd  |dd |dd |dd qt| j }t|dkort	|d d }|s|D ]}t
t|d |d< q|t|dkot	|d d }|s|D ]}t
|d |d< qd S )	Nr   r   Fr   r   r   step
mu_product)r"   __setstate__param_groups
setdefaultliststatevalueslentorchZ	is_tensortensorfloat)r$   r1   groupZstate_valuesZstep_is_tensorsZmu_product_is_tensorr'   r)   r*   r-       s    
zNAdam.__setstate__c           
      C   s  |d D  ]}|j d k	r|| |j jr2td||j  | j| }	t|	dkr|d rptjdtj|j	dnt
d|	d< |d rtjdtj|j	dnt
d	|	d
< tj|tjd|	d< tj|tjd|	d< ||	d  ||	d  ||	d
  ||	d  qd S )Nr%   z'NAdam does not support sparse gradientsr   r   r)   )Zdtypedevicer   r+   r   r,   )Zmemory_formatexp_avg
exp_avg_sq)gradappendZ	is_sparseRuntimeErrorr1   r3   r4   Zzerosr6   r9   r5   ZonesZ
zeros_likeZpreserve_format)
r$   r7   params_with_gradgradsexp_avgsexp_avg_sqsmu_productsstate_stepspr1   r)   r)   r*   _init_group1   s,    


zNAdam._init_groupc                 C   s   |    d}|dk	r.t  | }W 5 Q R X | jD ]}g }g }g }g }g }g }	|d \}
}| |||||||	 t||||||	|
||d |d |d |d |d |d |d	 |d
 d q4|S )zPerforms a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r   r   r   r   r   r   )
beta1beta2r   r   r   r   r   r   r   r   )Z _cuda_graph_capture_health_checkr4   Zenable_gradr.   rF   r   )r$   closureZlossr7   r?   r@   rA   rB   rC   rD   rG   rH   r)   r)   r*   r+   Q   s@    

z
NAdam.step)r   r   r   r   r   F)N)__name__
__module____qualname__boolr   r#   r-   rF   r   r+   __classcell__r)   r)   r'   r*   r   	   s$            a  Implements NAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
            &\hspace{13mm} \: \textit{decoupled\_weight\_decay}                                  \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1}                                       \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm}\textbf{if} \: \textit{decoupled\_weight\_decay}                       \\
            &\hspace{15mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}                    \\
            &\hspace{10mm}\textbf{else}                                                          \\
            &\hspace{15mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
    a  
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
        decoupled_weight_decay (bool, optional): whether to use decoupled weight
            decay as in AdamW to obtain NAdamW (default: False)
        z	
        z

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101

    F)r%   r@   rA   rB   rC   rD   r   r   r   r   rG   rH   r   r   r   r   c
                C   s   t dd |D stdt dd |D s4td|dkrNt| |	dd\}}|rdtj rdtd	|rxtj sxt}nt}|| ||||||
||||||||	d
 dS )zpFunctional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    c                 s   s   | ]}t |tjV  qd S N
isinstancer4   r   .0tr)   r)   r*   	<genexpr>   s     znadam.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc                 s   s   | ]}t |tjV  qd S rO   rP   rR   r)   r)   r*   rU      s     zPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNF)Z	use_fusedz6torch.jit.script not supported with foreach optimizers)	rG   rH   r   r   r   r   r   r   r   )allr>   r   r4   ZjitZis_scripting_multi_tensor_nadam_single_tensor_nadam)r%   r@   rA   rB   rC   rD   r   r   r   r   rG   rH   r   r   r   r   _funcr)   r)   r*   r      s6    )r%   r@   rA   rB   rC   rD   rG   rH   r   r   r   r   r   r   r   c       	         C   s  t | D ]\}}|| }|| }|| }|| }|| }tj st|rt|jrZ|jrZ|jst|jrl|jrl|jsttd|d7 }|r|}nt|}d||  }|	dkr|r|d||	   n|j	||	d}|ddd||
     }|ddd|d |
     }||9 }|
|d|  ||j||d| d || }|sH|r|	|}|| }|| d|  d|   }|| | d|   }||| ||| qt|| }|| |j||| d|  dt|  d |j||| | d|  d qd S )	NzUIf capturable=True, params, mu_products, and state_steps must be CUDA or XLA tensors.r   r   alphar         ?Q?)value)	enumerater4   _utilsis_compilingis_cudaZis_xlaAssertionErrorr   Zmul_addZlerp_Zaddcmul_divsqrtZaddcdiv_Zadd_)r%   r@   rA   rB   rC   rD   rG   rH   r   r   r   r   r   r   r   iparamr<   r:   r;   r,   Zstep_tr+   Zbias_correction2mumu_nextdenomZmu_product_nextr)   r)   r*   rX      s^    

&rX   c       	             s  t | dkrd S |rtdtj sL|rLtdd t| ||D sLtdt| |||||g}|	 D ]\\}}}}}}}t
|d |	dkr|rt|d|	   ntj|||	d}t||d   t| t|||d  t|}|rt|}td|}t|d	 t
|d
 t|  t
| td|}t|d	 t
|d
 t|  ~t|}t|d
 t| t| n:fdd|D } fdd|D } fdd|D }t|| t|| t
|| ~|rt|d
 t| t|d
}t| t|| |}~t||}t| t|d
 t|| |}~t||}t||| t||| qjtfddt||D }tfddt||D }t|||| t|||| qjd S )Nr   z#_foreach ops don't support autogradc                 s   s&   | ]\}}}|j o|j o|j V  qd S rO   )rc   )rS   rE   mpr+   r)   r)   r*   rU   Y  s   z&_multi_tensor_nadam.<locals>.<genexpr>zNIf capturable=True, params, mu_products, and state_steps must be CUDA tensors.r   r[   r^   g      r   c                    s    g | ]}t d  t|  qS )r   )r   r   rS   r+   )rH   r)   r*   
<listcomp>  s     z'_multi_tensor_nadam.<locals>.<listcomp>c                    s(   g | ] } d ddt |     qS )r   r]   r^   r   rn   rG   r   r)   r*   ro     s     c                    s,   g | ]$} d ddt |d      qS )r   r]   r^   r   rp   rn   rq   r)   r*   ro     s   c                    s,   g | ]$\}} d |  d t |  d qS r   rp   )rS   r,   rj   r   r)   r*   ro     s   c                    s,   g | ]$\}} | d t ||   d qS rr   rp   )rS   r,   rk   rt   r)   r*   ro     s   )r3   rd   r4   ra   rb   rV   zipr   Z"_group_tensors_by_device_and_dtyper2   Z_foreach_add_Z_foreach_mul_Z_foreach_addZ_foreach_lerp_Z_foreach_addcmul_Z_foreach_sqrtZ_foreach_mulZ_foreach_powZ_foreach_sub_Z_foreach_neg_Z_foreach_sqrt_Z_foreach_div_Z_foreach_subZ_foreach_addcdiv_r   ) r%   r@   rA   rB   rC   rD   rG   rH   r   r   r   r   r   r   r   Zgrouped_tensorsZgrouped_paramsZgrouped_gradsZgrouped_exp_avgsZgrouped_exp_avg_sqsZgrouped_mu_productsZgrouped_state_stepsrY   Zexp_avg_sq_sqrtexponentZmusZmu_nextsZbias_correction_sqrtrl   Zstep_size_gradsZstep_size_expavg	numeratorr)   )rG   rH   r   r   r*   rW   A  s    

   


rW   )FNFF)r4   r   Z	optimizerr   r   r   r   r   r	   r
   r   r   typingr   r   __all__r   __doc__rM   r6   r   rX   rW   r)   r)   r)   r*   <module>   s   ,u#D    ;L