# caffe2/python/optimizer.py
#
# Reconstructed, human-readable source for this module (the file here was a
# compiled-bytecode dump). Docstrings, signatures and names are recovered from
# the dump; method bodies that could not be recovered verbatim are summarized
# in comments and stubbed with `...`.

import copy
import logging
from collections import defaultdict, namedtuple

import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import core, scope, utils, workspace
from caffe2.python.modeling import parameter_info
from past.builtins import basestring

_LEARNING_RATE_INJECTION = "lr_injection"

AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
_optimizer_instance_count = defaultdict(int)

FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"]

logger = logging.getLogger(__name__)


def reset_optimizer_instance_count():
    """
    Clears _optimizer_instance_count and keeps it empty. This is needed in
    some situations where the optimizer instance count might not reset even
    though the workspace is reset.
    """
    _optimizer_instance_count.clear()
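
# Example (illustrative, not part of the original module): when a script resets
# the workspace and rebuilds its nets, clearing the counters keeps the
# auto-generated optimizer blob names (e.g. "SgdOptimizer_0_lr_cpu") stable:
#
#   workspace.ResetWorkspace()
#   reset_optimizer_instance_count()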
    N)_optimizer_instance_countclear r   r   ;/tmp/pip-unpacked-wheel-ua33x9lu/caffe2/python/optimizer.pyreset_optimizer_instance_count   s    r   c                   @   s   e Zd Zdd Zd$ddZdd Zd%d	d
Zdd Zedd Z	dd Z
d&ddZdd Zd'ddZdd Zedd Zdd Zd d! Zd"d# ZdS )(	Optimizerc                 C   sF   t g g d| _t| jj | _t| jj  d7  < d | _d | _d| _d S )N)r   r      F)	r   _aux_paramsr   	__class____name___instance_num_lr_multiplier_local_lr_multiplier_local_lr_multiplier_on_gpuselfr   r   r   __init__#   s    zOptimizer.__init__Nc                 C   sh   |d kr2t |tjs"td||jd k	sVtn$t |trFt|}tjd ||d}| 	||| d S )Nz6Expected parameter to be of type ParameterInfo, got {})Zparam_idparamgrad)

isinstancer	   ZParameterInfoAssertionErrorformatr!   r
   r   ZBlobReference_run)r   netparam_init_netr    r!   r   r   r   __call__4   s     

zOptimizer.__call__c                 C   s   t dd S )NzNot Implemented)	Exception)r   r&   r'   
param_infor   r   r   r%   A   s    zOptimizer._run c                 C   s   | j j}d|| j||f S )Nz%s_%d_%s%s_cpur   r   r   )r   base_str	node_name	classnamer   r   r   get_cpu_blob_nameD   s    zOptimizer.get_cpu_blob_namec                 C   s   | j j}d|| j|||f S )Nz%s_%d_%s%s_gpu%dr,   )r   r-   Zgpu_idr.   r/   r   r   r   get_gpu_blob_nameH   s    zOptimizer.get_gpu_blob_namec                 C   s   t | j}|d= |S )Nr   )copydeepcopy__dict__)r   attrr   r   r   
attributesR   s    zOptimizer.attributesc                 C   sJ   t  }|dkr| |S t|jr8| ||j|jS | ||jS dS )zo
        Returns a blob name that will be unique to the current device
        and optimizer instance.
        N)	r   CurrentDeviceScoper0   r   IsGPUDeviceTypedevice_typer1   	device_idr.   )r   r-   current_scoper   r   r   make_unique_blob_nameY   s    
  zOptimizer.make_unique_blob_namefixedr   c                 K   s   |d kr|  d}tj|||d}||sL|j|g|f| |d|}	n
||}	| jd k	r|| j|  d}
|j|	|
g|  ddd}	| j	d k	rt
 }|d k	rt|jr| js|| j	|  d}n| j	}|j|	|g|  d	dd}	|	|fS )
Nlriter_val)Zbase_lrpolicylr_multiplierZ	scaled_lrr   	broadcastlocal_lr_multiplierZlocal_scaled_lr)r<   r   BuildUniqueMutexIterBlobIsDefinedZLearningRateZ
GetBlobRefr   ZCopyFromCPUInputMulr   r   r7   r   r8   r9   r   )r   r&   r'   base_learning_rateZlearning_rate_blobrA   r@   kwargs	iterationr>   rB   r;   rE   r   r   r   build_lri   sV    




 

zOptimizer.build_lrc                 C   s
   || _ dS )z
        Set the global learning rate multiplier. If a multiplier already
        existed, this will overwrite the existing multiplier. The multiplier is
        used for all future calls to _run(), unless it is overwritten.
        N)r   )r   rB   r   r   r   add_lr_multiplier   s    zOptimizer.add_lr_multiplierFc                 C   s   || _ || _dS )a  
        Set the local learning rate multiplier. This local multiplier is
        multiplied with the global learning rate multiplier if it exists. As
        with the global learning rate multiplier, this multiplier will be
        used for all future calls to _run(), so please call
        _clear_local_lr_multiplier() at the beginning of the optimizer's _run()
        before optionally calling this function.
        Nr   r   )r   rE   is_gpu_blobr   r   r   _add_local_lr_multiplier   s    	z"Optimizer._add_local_lr_multiplierc                 C   s   d | _ d| _d S )NFrN   r   r   r   r   _clear_local_lr_multiplier   s    z$Optimizer._clear_local_lr_multiplierc                 C   s4   t |tjstd||r,| j||dS |S d S )Nz,Dedup only works for sparse gradient, got {})Z
aggregator)r"   r   GradientSlicer#   r$   ZDeduplicateGradientSlices)r&   sparse_dedup_aggregatorr!   r   r   r   dedup   s      zOptimizer.dedupc                 C   s   | j S )ax  Returns a list of auxiliary parameters.

        Returns:
            aux_params: A namedtuple, AuxParams.

            aux_params.local stores a list of blobs. Each blob is a local
            auxiliary parameter. A local auxiliary parameter is a parameter in
            parallel to a learning rate parameter. Take adagrad as an example,
            the local auxiliary parameter is the squared sum parameter, because
            every learning rate has a squared sum associated with it.

            aux_params.shared also stores a list of blobs. Each blob is a shared
            auxiliary parameter. A shared auxiliary parameter is a parameter
            that is shared across all the learning rate parameters. Take adam as
            an example, the iteration parameter is a shared parameter, because
            all the learning rates share the same iteration parameter.
        )r   r   r   r   r   get_auxiliary_parameters   s    z"Optimizer.get_auxiliary_parametersc                 O   s   t dd S )Nz9Optimizer Need to Implement `scale_learning_rate` method.)NotImplementedError)r   argsrJ   r   r   r   scale_learning_rate   s    zOptimizer.scale_learning_ratec                 C   sF   |j g ddg|d}|j g ddg|d}|j g ddg|d}|||fS )Nweight_decayr   shapevaluetrustlr_max)ConstantFill)r   r'   rY   r]   r^   wdr   r   r   create_lars_inputs   s       zOptimizer.create_lars_inputs)N)r+   )Nr=   r   )F)r   
__module____qualname__r   r(   r%   r0   r1   propertyr6   r<   rL   rM   rP   rQ   staticmethodrT   rU   rX   ra   r   r   r   r   r   "   s(   



   
=

r   c                       s.   e Zd Zd fdd	Zdd	 Zd
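
# Example (illustrative sketch only, not part of the original module): a minimal
# Optimizer subclass. The names `ExampleScaledSgd` and `scale` are hypothetical;
# the pattern, build a learning-rate blob with build_lr() and emit update ops in
# _run(), is the contract the subclasses below follow.
#
#   class ExampleScaledSgd(Optimizer):
#       def __init__(self, base_learning_rate=0.01, policy="fixed", **kwargs):
#           super(ExampleScaledSgd, self).__init__()
#           self.base_learning_rate = base_learning_rate
#           self.policy = policy
#           self.init_kwargs = kwargs
#
#       def _run(self, net, param_init_net, param_info):
#           param, grad = param_info.blob, param_info.grad
#           lr, _ = self.build_lr(
#               net, param_init_net,
#               base_learning_rate=self.base_learning_rate,
#               policy=self.policy,
#               **self.init_kwargs
#           )
#           # Plain SGD step: param <- param + lr * grad (build_lr folds the
#           # descent sign into the lr blob).
#           one = param_init_net.ConstantFill([], "ONE_example", shape=[1], value=1.0)
#           net.WeightedSum([param, one, grad, lr], param)
#
#       def scale_learning_rate(self, scale):
#           self.base_learning_rate *= scale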
d Z  ZS )SgdOptimizer{Gz?r=           TNc                    s<   t t|   || _|| _|| _|| _|| _|| _|| _	d S N)
superrf   r   rI   rA   momentumnesterovrS   larsinit_kwargs)r   rI   rA   rk   rl   rS   rm   rJ   r   r   r   r      s    
zSgdOptimizer.__init__c                 C   sX  |j }|j}| jdkrd S | jdks4td| j|   | jd k	rt|tj	s| jdksltd| j| 
|ddttjj\}}}|j|||||g| t|d | jdd}	t }
| j|	|
d k	ot|
jd | jrd	nd
}| j||f| j| | jd| j\}}t }|d kr2ttj}|jg d|j|j|j d
gdd}| j!j"#| | jdkr|j|t|d dd}| j!j$#| t|tj	r
| %|| j&|}| jdkr|j'|j(||||j)g|j(||g| j| j*d n|+|||j)|j(|g| nJ| jdkr<|j,||||g|||g| j| j*d n|}|-||||g| d S )Nr   *Expect positive base learning rate, got {}'Lars offset must be nonnegative, got {}rh         ?_larsoffsetZlr_minrO   r   rI   rA   zONE_{}_{}{}rZ   	_momentumr\   rk   rl   ).blobr!   rI   r#   r$   rQ   rm   r"   r   rR   ra   npfinfofloat32maxLarsr<   strr   r7   rP   r8   r9   rk   rL   rA   rn   DeviceOptionr   CPUr_   r:   r.   r   r   appendr   rT   rS   ZSparseMomentumSGDUpdatevaluesindicesrl   ZScatterWeightedSumMomentumSGDUpdateWeightedSum)r   r&   r'   r*   r    r!   r`   r]   r^   lr_lars_multiplierr;   Zlr_signr>   _devONEmomentum_dataZcoeffr   r   r   r%     s    

   




 
 
 
zSgdOptimizer._runc                 C   s   |  j |9  _ d S ri   )rI   r   scaler   r   r   rX   d  s    z SgdOptimizer.scale_learning_rate)rg   r=   rh   TNNr   rb   rc   r   r%   rX   __classcell__r   r   ro   r   rf      s         \rf   c                       s&   e Zd Zd
 fdd	Zdd	 Z  ZS )MultiPrecisionSgdOptimizer皙?rh   r=   TNc                    s&   t t| jf |||||d| d S N)rI   rA   rk   rl   rS   )rj   r   r   )r   rI   rk   rA   rl   rS   rJ   ro   r   r   r   j  s    	z#MultiPrecisionSgdOptimizer.__init__c                 C   s  |j }|jd k	r|jtjj nd }|d kr:t| |||S |j}| jdkrNd S | jdksht	d
| j| j||f| j | jd| j\}}|j|t|d dd}	| jj|	 t|tjrt	d|||d }
|j|
|	||g|
|	|g| j| jd	 ||| d S )
Nr   rp   rx   ry   rh   rz   z3MultiPrecisionSgd does not support sparse gradientsZ_fp32r{   )r|   	blob_copyr   DataTypeFLOATrf   r%   r!   rI   r#   r$   rL   rA   rn   r_   r   r   r   r   r"   rR   ZHalfToFloatr   rk   rl   FloatToHalf)r   r&   r'   r*   r    
param_fp32r!   r>   r   r   Z	grad_fp32r   r   r   r%   |  sV    


 
  
zMultiPrecisionSgdOptimizer._run)r   rh   r=   TNr   rb   rc   r   r%   r   r   r   ro   r   r   i  s        r   c                       s(   e Zd Zd fdd	Zdd
dZ  ZS )FP16SgdOptimizerr   rh   r=   T-C6?Nc                    s,   t t| jf |||||d| || _d S r   )rj   r   r   rY   )r   rI   rk   rA   rl   rY   rS   rJ   ro   r   r   r     s    
zFP16SgdOptimizer.__init__Fc                 C   s  d}t |j}|ddkr d}|r6d}|j}|j}nt|jd krRd}|j}|j}nXtjj|jkrv|j}|jtjj }n4tjj|jkr|jtjj }|j}ntd	|j
 |j}	| jdkrd S | jdkstd	| j| j||f| j | jd| j\}
}|j|t |d	 d
d}||t |d }| jj| t|	tjrNtd|dkr|j|	||
|g|	||g| j| j| jd n(|j|	||
|g|	||g| j| j| jd d S )Nr   Zspatbnrw   Tr   zLUnrecognized parameter format to be updated by FP16 Optimizer. Parameter: {}rp   rx   Z_momentum_fp32rh   rz   ry   z)FP16Sgd does not support sparse gradients)rk   rl   rY   )r   r|   findr   r   r   r   ZFLOAT16r#   r$   namer!   rI   rL   rA   rn   r_   r   r   r   r   r"   rR   ZFP16MomentumSGDUpdaterk   rl   rY   ZFP32MomentumSGDUpdate)r   r&   r'   r*   Zfp32_updateZfp32_update_flag
param_namer    r   r!   r>   r   Zmomentum_data_fp32r   r   r   r   r%     s    




 
  
 

	
zFP16SgdOptimizer._run)r   rh   r=   Tr   N)Fr   r   r   ro   r   r     s         r   c                   @   s   e Zd Zdd Zdd ZdS )WeightDecayBuilderc                 C   s
   || _ d S ri   rY   )r   rY   r   r   r   r     s    zWeightDecayBuilder.__init__c                 C   s   t  }|d krttj}|jg d|j|j	dgdd}|jg d|j|j	dg| j
d}t|jtjrvtdn||j||j|g|j d S )N	ONE_{}_{}r   rr   rZ   zwd_{}_{}z2Weight decay does not yet support sparse gradients)r   r7   r   r   r   r   r_   r$   r9   r:   rY   r"   r!   rR   
ValueErrorr   r|   )r   r&   r'   r*   r   r   ZWDr   r   r   r%     s*       
 zWeightDecayBuilder._runN)r   rb   rc   r   r%   r   r   r   r   r     s   r   c                       sF   e Zd Zd fd
d	Zdd Zdd Zdd Zdd Zdd Z  Z	S )AdagradOptimizerrg   r   r   rh   r=   NFr+   rw   c                    s   t t|   || _|| _|| _t|| _|| _|| _	|| _
|| _|	| _|
| _|| _|| _|| _|| _| | | | | | d S ri   )rj   r   r   alphaepsilondecayfloatrY   rA   rS   rowWiseenginerm   output_effective_lroutput_effective_lr_and_updatecounter_halflifern   weight_scale_process_pruning_options_process_swa_options_process_ema_options)r   r   r   r   rY   rA   rS   r   r   rm   r   r   pruning_optionsswa_optionsema_optionsr   r   rJ   ro   r   r   r   7  s$    


zAdagradOptimizer.__init__c                 C   s^   |rdnd| _ | j rZ|dd | _|dd | _|dd | _|dd | _|dd | _d S )NTFswa_avg_start_itswa_avg_end_itswa_feedback_start_itswa_feedback_stepswa_feedback_end_it)swa_enabledgetr   r   r   r   r   )r   r   r   r   r   r   _  s    z%AdagradOptimizer._process_swa_optionsc                 C   sP   |rdnd| _ | j rL|dd | _|dd | _|dd | _|dd | _d S NTF	ema_startema_endema_step	ema_alphaema_enabledr   r   r   r   r   r   r   r   r   r   r   h  s    z%AdagradOptimizer._process_ema_optionsc                 C   s  d| _ |d kri }nt|ts,td||dd | _|dd | _|dd | _|dd | _	|dg | _
|dg | _|d	d
| _| jd k	rt| jtjkstd| jd kstd| jd kstd| j	d kstdd| _ | jd k	s| jd k	rF| jd k	std| jd k	s,td| jd ks@tdd| _ | j
r| jd k	rpt| j
t| jksxtd| jd kstd| jd kstdd| _ d S )NFzCpruning_options can only be provided as a dictionary, currently: {}mask_tensormask_db_pathmask_db_typemask_blob_nameprune_delaysprune_ratiosprune_block_sizer   z"mask_tensor must be a numpy array!zHmask can be provided through either a numpy array or a db path, not bothTzPwhen mask is provided through db, db path, db type, and blob name are all neededz:Prune Delays and prune ratios should be of the same lengthz,Mask Tensor should be None with prune ratiosz-Mask DB Path should be None with prune ratios)use_maskr"   dictr#   r$   r   r   r   r   r   r   r   r   typer}   Zndarraylen)r   r   r   r   r   r   p  sp    

z)AdagradOptimizer._process_pruning_optionsc               
   C   sf
  |j }|j}| jdkrd S |   | jd k	rt|tjs| jdksJt	d| jdksdt	d
| j| |ddttjj\}}}|j|||||g| t|d | jdd}	t }
| j|	|
d k	ot|
jd | j||f| j| jd	| j\}}|}| jdkr| jj| | jrt !d

| j"| t#$|g\}}t||kr|%|t|d }|j&|gt|d dgdgd}|j'|t|d ddd}n(|j'g t|d |t| d gdd}nt !d
| j"| | j"t(krV| jdkst	d
| j"t#$|g\}}t||ks.t	||t| }|j)g t|d d|d}n|j'|gt|d dd}| j*dkr&| jdkst	d| j+d k	r8t|tjs|j,g t|d g| j+| j+j-d}nh| j+.tj/| _+|j0g t|d g| j+| j+j-d}|j1|tj2j3d}|j'g t|d gdtj2j4dgd}n| j5d k	sP| j6d k	r| j7pbt|d | _7|j8g | j7| j5| j6dd}t|tjr&|j'g t|d gdtj2j4dgd}nr| j9r|j'g t|d  gd!tj2j:dgd}t|tjrt	d" n$|j,g t|d# gg tj2j;dgd$}nt<d%| jj=| | jdkrLt#$|g\}}t||kr|%|t|d }|j&|gt|d dgdgd}|j'|t|d& ddtj2j>d'}|j'|t|d( ddtj2j:d'}n\|j'g t|d& |t| d gdtj2j>d)}|j'g t|d( |t| d gdtj2j:d)}| jj=| | jj=| | jrjt|tjsjt	d*t#$|g\}}|t| }d}t|tjrt?|dkrd}t @d+
t|| n| j}n0t?|dkrd}t Ad,
t|| n| j}t !d-
t||| t|tjr| jBdks4t	d.| C|| jD|}|||jE|jF|g}||g}| jr| j*dkrd/}|dkst	d0
||||g7 }n| jdkr||g7 }d1}n:| j*dkrd2}|dkst	d0
||||g7 }nd3}t !d4
|t| | j9r(|||g7 }|||g7 }|dkr\| jd!kr\|G|||| jH|| j"d5 nR|dkr| jd!kr|G|||| jH|| j"| jd6 n|G|||| jH| j"d7 | jdk	r|jI|||jE|g||g| jd8 n||||g}||g}| jJr4| j*dkst	d9|t|d:  |t|d;  n.| jKrb| j*dksPt	d<|t|d:  | j*dkrx||g7 }| j9r|||g7 }|||g7 }| j*r|dkst	d|jL||| jHtM| jB| jN| j9| jO| j"d= n|dk	r
|jP||| jHtM| jB|| j"d> n|jP||| jHtM| jB| j"d? | jQ	rt|d@ }|R|	sh|j'|g|dd | jj=| |jS|||g||g| jT| jU| jV| jW| jXdA | jY	rt|dB }|R|	s|j'|g|dd | jj=| |jZ|||g||g| j[| j\| j]| j^dC | j_
rb|j`||g|g| j_ja| j_jbtM| j_jcdD | j_jd
rb|j`||g|g| j_ja| j_jbtM| j_jcdD d S )ENr   z,weight decay is not implemented for LARS yetrq   rh   rr   rs   rt   rv   rx   z5Using engine {} for rowWise Adagrad to train param {}Z_shapeZ_numrowsr   )ZstartsZendsZ_avg_squared_sum)input_as_shaper\   rZ   z5Using engine {} for regular Adagrad to train param {}z)weight decay is not tested for engine: {}_squared_sum)r\   r[   rz   Tz0weight decay is not implemented for use_mask yetZ_mask)r   r[   )toZ_mask_changed_blobF)r\   dtyper[   )dbZdb_typeZabsolute_pathZ_last_mask_updated_iterrw   zMPrune Delays and Prune Ratios are currently not supportedfor sparse operatorsZ_empty_mask)r   r   r[   zXIf mask is used, it needs a numpy array or a db file ora delay iter needs to be providedZ_update_counter)r   r\   r   Z_prev_update_iterr[   r\   r   zIf SparseAdagrad with rowWise=True, gradient must be a gradientslice. PLease ensure that rowWise is not enabled for the dense Adagrad optimizer, as it is not supported.z8SKIPPING weight decay on 1d sparse param: {}.shape is {}z7SKIPPING weight decay on 1d dense param: {}.shape is {}z"weight_decay for {} (shape:{}): {}z?Decay is not implemented for SparseAdagrad and must be set to 1ZMaskedRowWiseSparseAdagradz*weight decay is not implemented for {} yetZRowWiseSparseAdagradZMaskedSparseAdagradSparseAdagradzusing {} for {})r   rY   r   )r   rY   r   r   r   r   )r   z@MaskedAdagrad doesn't support outputting effective_lr_and_update_effective_lr_updatez5MaskedAdagrad doesn't support outputting effective_lr)r   r   
block_sizeZdelaysr   r   )r   r   rY   r   r   r   r   Z_swa)Z	avg_startZavg_endZfeedback_startZfeedback_stepZfeedback_end_emar   r   r   r   )stepsizeupper_bound_iterr   )er|   r!   r   rQ   rm   r"   r   rR   rY   r#   r$   ra   r}   r~   r   r   r   r<   r   r   r7   rP   r8   r9   rL   rA   rn   r   r   r   r   r   loggerdebugr   r   InferShapesAndTypesZShapeZSlicer_   FP16_ENGINESZFloat16ConstantFillr   r   ZGivenTensorFillr[   ZastypeZuint8ZGivenTensorBoolFillZCastr   ZUINT8ZBOOLr   r   r   Loadr   INT64r   rV   r   ZDOUBLEr   warnwarningr   rT   rS   r   r   __getattr__r   ZRowWiseCounterr   r   ZMaskedAdagradr   r   r   ZAdagradr   rG   ZSWAr   r   r   r   r   r   EMAr   r   r   r   r   ZWeightScaler   r   r   Zto_aux) r   r&   r'   r*   r    r!   r`   r]   r^   r   r;   r>   Zlr_iterationrK   shapestypesr[   Znum_rowsparam_squared_sumZ	mask_blobZmask_changed_blobZlast_mask_updated_iterZupdate_counterZprev_update_iterr   Zparam_shaperY   Z
input_argsoutput_argsopZ	param_swa	param_emar   r   r   r%     s   
   


  
  

 
 
   
  
  



    
	   


	
	


zAdagradOptimizer._runc                 C   s   |  j |9  _ d S ri   r   r   r   r   r   rX   S  s    z$AdagradOptimizer.scale_learning_rate)rg   r   r   rh   r=   NFr+   NFFNNNNrw   )

class WngradOptimizer(Optimizer):
    def __init__(
        self,
        alpha=1.0,
        epsilon=1e-9,
        policy="fixed",
        sparse_dedup_aggregator=None,
        engine="",
        moment_init=100.0,
        lars=None,
        output_effective_lr=False,
        output_effective_lr_and_update=False,
        **kwargs
    ):
        super(WngradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.moment_init = moment_init
        self.lars = lars
        self.output_effective_lr = output_effective_lr
        self.output_effective_lr_and_update = output_effective_lr_and_update
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        # Builds the learning rate (optionally scaled by a LARS coefficient),
        # creates a "<param>_moment" accumulator initialized to moment_init
        # (registered in self._aux_params.local), and emits Wngrad, or
        # SparseWngrad for GradientSlice gradients (after dedup), with epsilon
        # and the configured engine. output_effective_lr(_and_update) request
        # the extra "<param>_effective_lr" / "<param>_effective_lr_update"
        # outputs.
        ...

    def scale_learning_rate(self, scale):
        self.alpha *= scale


class StormOptimizer(Optimizer):
    def __init__(
        self,
        lr=0.1,
        momentum=10.0,
        beta=0.1,
        grad_sq_init=0.01,
        policy="fixed",
        sparse_dedup_aggregator=None,
        lars=None,
        **kwargs
    ):
        """Constructor function to add STORM Optimizer

        Args:
            lr: learning rate scaling (called k in the original paper)
            momentum: momentum scaling (called c in the original paper)
            beta: initial value of denominator in adaptive learning rate
              (called w in the original paper)
            grad_sq_init: initial value of gradient squared accumulator.
            policy: specifies how learning rate should be applied, options are
              'fixed', 'step', 'exp', etc.
            sparse_dedup_aggregator: specifies deduplication strategy for
              gradient slices. Works while using sparse gradients. Options
              include 'mean' and 'sum'.
            lars: lars offset.
        """
        super(StormOptimizer, self).__init__()
        self.lr = lr
        self.momentum = momentum
        self.beta = beta
        self.grad_sq_init = grad_sq_init
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.lars = lars
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        # Builds the learning rate (optionally scaled by a LARS coefficient),
        # creates "<param>_moment" and "<param>_grad_sq_sum" accumulators
        # (registered in self._aux_params.local), and emits Storm, or
        # SparseStorm for GradientSlice gradients (after dedup), with
        # momentum and beta.
        ...

    def scale_learning_rate(self, scale):
        self.lr *= scale
 Zdd Z  ZS )AdadeltaOptimizerrg   r   ffffff?r=   Nr+   c                    s<   t t|   || _|| _|| _|| _|| _|| _|| _	dS )au  Constructor function to add Adadelta Optimizer

        Args:
            alpha: learning rate
            epsilon: attribute of Adadelta to avoid numerical issues
            decay: attribute of Adadelta to decay the squared gradient sum
            policy: specifies how learning rate should be applied, options are
              "fixed", "step", "exp", etc.
            sparse_dedup_aggregator: specifies deduplication strategy for
              gradient slices. Works while using sparse gradients. Options
              include "mean" and "sum".
            engine: the engine used, options include "", "CUDNN", etc.
        N)
rj   r   r   r   r   r   rA   rS   r   rn   )r   r   r   r   rA   rS   r   rJ   ro   r   r   r   $  s    zAdadeltaOptimizer.__init__c           
      C   s
  |j }|j}| jdkrd S | j||f| j| jd| j\}}|j|gt|d dd}|j|gt|d dd}	| jj	
| | jj	
|	 t|tjr| || j|}|j|||	|j|j|g|||	g| j| j| jd n*|j|||	||g|||	g| j| j| jd d S )Nr   rx   Z_squared_momentrh   rz   Z_squared_moment_updater   )r|   r!   r   rL   rA   rn   r_   r   r   r   r   r"   r   rR   rT   rS   ZSparseAdadeltar   r   r   r   r   ZAdadelta)
r   r&   r'   r*   r    r!   r>   r   r   Zmoment_updater   r   r   r%   D  sR    

 
  
 zAdadeltaOptimizer._runc                 C   s   |  j |9  _ d S ri   r   r   r   r   r   rX   p  s    z%AdadeltaOptimizer.scale_learning_rate)rg   r   r   r=   Nr+   r   r   r   ro   r   r   #  s          ,r   c                       s.   e Zd Zd fdd	Zdd	 Zd
d Z  ZS )FtrlOptimizerrg   r   r   Nr+   c                    s6   t t|   || _|| _|| _|| _|| _|| _d S ri   )	rj   r   r   r   r   lambda1lambda2rS   r   r   r   r   r   r   rS   r   ro   r   r   r   v  s    	zFtrlOptimizer.__init__c              	   C   s   |j }|j}| jdkrd S |j|gt|d dgdd}| jj| t|t	j
r| || j|}|j|||j|jg||g| j| j| j| j| jd n,|j|||g||g| j| j| j| j| jd d S )Nr   Z_ftrl_nz   rh   Zextra_shaper\   r   r   r   r   r   )r|   r!   r   r_   r   r   r   r   r"   r   rR   rT   rS   Z
SparseFtrlr   r   r   r   r   r   ZFtrlr   r&   r'   r*   r    r!   Znzr   r   r   r%     s>    
 
  
zFtrlOptimizer._runc                 C   s   |  j |9  _ d S ri   r   r   r   r   r   rX     s    z!FtrlOptimizer.scale_learning_rate)rg   r   r   r   Nr+   r   r   r   ro   r   r   u  s         !r   c                       s2   e Zd ZdZd fdd	Zd	d
 Zdd Z  ZS )GFtrlOptimizerzGroup Lasso FTRL Optimizer.rg   r   r   Nr+   c                    s6   t t|   || _|| _|| _|| _|| _|| _d S ri   )	rj   r  r   r   r   r   r   rS   r   r   ro   r   r   r     s    	zGFtrlOptimizer.__init__c              	   C   sv   |j }|j}| jdkrd S |j|gt|d dgdd}| jj| |j|||g||g| j	| j| j
| j| jd d S )Nr   Z	_gftrl_nzr  rh   r  r  )r|   r!   r   r_   r   r   r   r   ZGFtrlr   r   r   r   r  r   r   r   r%     s(    
 
  zGFtrlOptimizer._runc                 C   s   |  j |9  _ d S ri   r   r   r   r   r   rX     s    z"GFtrlOptimizer.scale_learning_rate)rg   r   r   r   Nr+   r   rb   rc   __doc__r   r%   rX   r   r   r   ro   r   r    s         r  c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )AdamOptimizerMbP??+?:0yE>r=   Frg   TNr+   c                    s   t t|   || _|| _|| _|| _|| _|| _|| _	|| _
|	| _|
| _|| _|| _|r~|
rftd|rrtd|r~td|| _|| _d S )NzZSmart decay is not implemented for rowWise Adam.  Set rowWise or use_smart_decay to False.zWSmart decay is not implemented for RAdam.  Set enableRAdam or use_smart_decay to False.zbSmart decay is not implemented with lr_adaption.  Set use_lr_adaption or use_smart_decay to False.)rj   r  r   r   beta1beta2r   rA   use_lr_adaptionlr_alphanormalized_lr_adaptionrS   r   r   enableRAdamrV   use_smart_decayrn   )r   r   r  r  r   rA   r  r  r  rS   r   r   r  r  rJ   ro   r   r   r     s,    zAdamOptimizer.__init__c              	   C   s  |j }|j}| jdkrd S | j||f| j| jd| j\}}|j|g|d dd}| jrt	|g\}	}
|jg |d |	| d gdd}n|j|g|d dd}| j
rt	|g\}	}|jg |d	 |	| d gdtjjd
}| jj| | jj| | jj| | jj| | jr4t|tjs4td|||g}| j
rP|| | jrnt|d }|| t|tjr| || j|}| jrd}n| j
rd}nd}|dkr||||||j|j||g|| j| j| j| jd n~|dkr*|||||||j|j||g|| j| j| jd n@| jr:td||||||j|j||g|| j| j| jd | jr|j ||j|g|g| j!| j"d nL|j#||||||g|| j| j| jd | jr|j |||g|g| j!| j"d d S )Nr   rx   Z_first_momentrh   rz   Z_avg_second_momentrZ   _second_momentZ
_last_seenr   zIf SparseAdam with rowWise=True, gradient must be a gradientslice. PLease ensure that rowWise is not enabled for the dense Adam optimizer, as it is not supported.Z_effective_gradZRowWiseSparseAdamZSmartDecaySparseAdamZ
SparseAdam)r  r  r   r  )r  r  r   z7Currently, RowWiseSparseAdam is not supported by RAdam!)r  r  )$r|   r!   r   rL   rA   rn   r_   r   r   r   r  r   r   r   r   r   r   r   r"   rR   r#   r  r   rT   rS   r   r   r   r  r  r   r  ZLearningRateAdaptionr  r  ZAdam)r   r&   r'   r*   r    r!   r>   rK   m1r   r   m2r   Z	last_seenoutput_blobsZeffective_gradr   r   r   r   r%     s    

         




	
zAdamOptimizer._runc                 C   s   |  j |9  _ d S ri   r   r   r   r   r   rX   ~  s    z!AdamOptimizer.scale_learning_rate)r	  r

class DecayAdagradOptimizer(Optimizer):
    def __init__(
        self,
        alpha=0.01,
        beta1=0.0,
        beta2=0.999,
        epsilon=0.1,
        weight_decay=0.0,
        ema_options=None,
        bias_correction_first=True,
        policy="fixed",
        engine="",
        **kwargs
    ):
        super(DecayAdagradOptimizer, self).__init__()
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.bias_correction_first = bias_correction_first
        self.policy = policy
        self.engine = engine
        self.init_kwargs = kwargs
        self._process_ema_options(ema_options)

    def _process_ema_options(self, ema_options):
        # Same bookkeeping as AdagradOptimizer._process_ema_options.
        self.ema_enabled = True if ema_options else False
        if self.ema_enabled:
            self.ema_start = ema_options.get("ema_start", None)
            self.ema_end = ema_options.get("ema_end", None)
            self.ema_step = ema_options.get("ema_step", None)
            self.ema_alpha = ema_options.get("ema_alpha", None)

    def _run(self, net, param_init_net, param_info):
        # For GradientSlice gradients this falls back to a plain SparseAdagrad
        # update on a "<param>_squared_sum" accumulator; for dense gradients it
        # creates "<param>_first_mo1ment" (blob name as in the original) and
        # "<param>_second_moment" and emits DecayAdagrad with beta1 / beta2 /
        # epsilon / weight_decay / bias_correction_first, optionally followed
        # by an EMA op on a "<param>_ema" copy when ema_options are enabled.
        ...

    def scale_learning_rate(self, scale):
        self.alpha *= scale


class YellowFinOptimizer(Optimizer):
    """YellowFin: An automatic tuner for momentum SGD

    See https://arxiv.org/abs/1706.03471 for more details. This implementation
    has separate learning rate and momentum for each parameter."""

    def __init__(
        self,
        alpha=0.1,
        mu=0.0,
        beta=0.999,
        curv_win_width=20,
        zero_debias=True,
        epsilon=1e-6,
        policy="fixed",
        sparse_dedup_aggregator=None,
        **kwargs
    ):
        super(YellowFinOptimizer, self).__init__()
        self.alpha = alpha
        self.mu = mu
        self.beta = beta
        self.curv_win_width = curv_win_width
        self.zero_debias = zero_debias
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        # Creates the YellowFin state blobs ("<param>_moment",
        # "<param>_curv_win", "<param>_g_avg", "<param>_g2_avg",
        # "<param>_lr_avg", "<param>_mu_avg" and a small
        # "<param>_scalars_memory" buffer), registers them in
        # self._aux_params.local, and emits a single YellowFin op with
        # curv_win_width / epsilon / zero_debias and the shared iteration blob.
        # Sparse gradients are not supported
        # ("YellowFin does not support sparse gradients").
        ...

    def scale_learning_rate(self, scale):
        self.alpha *= scale
 Zdd Z  ZS )RmsPropOptimizerrg   r
  rh   h㈵>r=   r+   c                    s<   t t|   || _|| _|| _|| _|| _|| _|| _	d S ri   )
rj   r!  r   r   r   rk   r   rA   r   rn   )r   r   r   rk   r   rA   r   rJ   ro   r   r   r   A  s    
zRmsPropOptimizer.__init__c                 C   sD  |j }|j}| jdkstt|tjr.tdt }|d krJt	t
j}|jg d|j|jdgdd}| j||f| j | jd| j\}}	|j|gt|d d	d
}
|j|gt|d d	d
}|j|gt|d d	d
}| jj| | jj| |j||||g|
||g| j| j| j| jd ||
|||g|
||g d S )Nr   z1RmsPropOptimizer doesn't support sparse gradientsr   r   rr   rZ   rx   Z_grad_orh   )r   Z_mean_squaresry   )r   rk   r   r   )r|   r!   r   r#   r"   r   rR   r   r7   r   r   r   r_   r$   r9   r:   rL   rA   rn   r   r   r   r   ZRmsPropr   rk   r   r   r   )r   r&   r'   r*   r    r!   r   r   r>   r   Zgrad_omsZmomr   r   r   r%   T  sb        
 
  
 
	zRmsPropOptimizer._runc                 C   s   |  j |9  _ d S ri   r   r   r   r   r   rX     s    z$RmsPropOptimizer.scale_learning_rate)rg   r
  rh   r"  r=   r+   r   r   r   ro   r   r!  @  s         1r!  c                 C   s"   t | j}|t | j |S ri   )r   ZInferBlobDevicesr&   updater'   )modelparam_to_devicer   r   r   _get_param_to_device  s    r'  c                 C   s   |}|p
i }| |kr||  }ndt |tjrj|}t|j|krL|t|j }qt|j|kr|t|j }nt|}||kr|| }|d k	std| |S )Nz,Cannot infer device for {}: no op creates it)r"   r   rR   r   r   r   r#   r$   )r   r!   r&  Zdefault_devicedeviceZ	grad_namer   r   r   get_param_device  s"    
r)  c                   C   s
   t tS )z
    Gets current value for lr_injection, a multiplier for all base
    learning rates.
    Must set allow_lr_injection=True when building optimizer, as it
    relies on synchronization over CPU.
    )r   Z	FetchBlob_LEARNING_RATE_INJECTIONr   r   r   r   get_lr_injection  s    r+  c                 C   s"   t ttjt| gtjd dS )z
    Sets lr_injection, a multiplier for all base learning rates.
    Must set allow_lr_injection=True when building optimizer, as it
    relies on synchronization over CPU.
    )r   N)r   ZFeedBlobr*  r}   arrayr   r   )Zlr_injection_valuer   r   r   set_lr_injection  s    r-  c                 C   s8  t |" g }t|D ]|\}}tt|j|j|}t |P t|jt j	sV|jn|jj
}	d|}
| j|	|
}| j|}|| W 5 Q R X qt t tjz | j|d}| jj|ddd}| jjg dg t|d}| j||gd}| j||gd	}|W  5 Q R  W  5 Q R  S Q R X W 5 Q R X d S )
Nzgrad_{}_squared_sumgrad_squared_full_sumglobal_normg      ?)exponent	clip_normrZ   max_norm
norm_ratio)r   Z	NameScope	enumerater)  r   r|   r!   DeviceScoper"   rR   r   r$   r&   ZSumSqrElementsZEnsureCPUOutputr   r   r   r   ZSumZPowr'   r_   r   ZMaxZDiv)r%  paramsZ
name_scoper&  max_gradient_normZgrad_squared_sumsir    r(  r!   Zgrad_squared_sum_nameZgrad_squared_sumZgrad_squared_sum_cpur.  r/  r1  r2  r3  r   r   r   _calc_norm_ratio  s@    
      r9  FTc              
   C   s  t | }|   g }|  D ] }|r2|j| jkr2q|| qd }	|d k	rZt| |d||}	|r| jt	s| j
jg t	dgdd}
nt	}
|	d kr|
}	n| jj|	|
gddd}	||	 |D ]`}t|j}t||j|}t|4 |jr|r|| j| j
| n|| j| j
| W 5 Q R X q|S )NZnorm_clipped_grad_updater   rr   rZ   rB   rC   )r'  ZValidateZGetOptimizationParamInfor|   weightsr   r9  r&   rG   r*  r'   r_   rH   rM   r   r)  r!   r   r5  	optimizer)r%  r;  weights_onlyuse_param_info_optimr7  allow_lr_injectionr&  r6  r*   rB   r   r   r(  r   r   r   _build  sR         


r?  c                 C   s   t | t|dddd dS )zAdds a decay to weights in the model.

    This is a form of L2 regularization.

    Args:
        weight_decay: strength of the regularization
    r   TF)r<  r=  N)r?  r   )r%  rY   r   r   r   add_weight_decay  s    r@  c                 K   s   t |f|}t| |||dS Nr7  r>  )rf   r?  )r%  rI   r7  r>  rJ   Zsgd_optimizerr   r   r   	build_sgd+  s    rC  c                 K   s   t |f|}t| |||dS rA  )r   r?  )r%  rI   r7  r>  rJ   Zmulti_prec_sgd_optimizerr   r   r   build_multi_precision_sgd;  s    rD  c                 K   s   t |f|}t| |S ri   )r   r?  )r%  rI   rJ   Zfp16_sgd_optimizerr   r   r   build_fp16_sgdK  s    rE  SIMDc                 K   s@   |dkr$t dstt ds$ttf d|i|}t| |S )NrF  ZFtrl_ENGINE_SIMDZSparseFtrl_ENGINE_SIMDr   )r   
IsOperatorr#   r   r?  )r%  r   rJ   Zftrl_optimizerr   r   r   
build_ftrlP  s
    rH  r+   c                 K   s2   |dkrt dsttf d|i|}t| |S )NrF  ZGFtrl_ENGINE_SIMDr   )r   rG  r#   r  r?  )r%  r   rJ   Zgftrl_optimizerr   r   r   build_gftrlX  s    rI  c                 K   s"   t f d|i|}t| |||dS Nr   rB  )r   r?  )r%  rI   
parametersr7  r>  rJ   Zadagrad_optimizerr   r   r   build_adagrad_  s    rL  c                 K   s"   t f d|i|}t| |||dS rJ  )r   r?  )r%  rI   rK  r7  r>  rJ   Zwngrad_optimizerr   r   r   build_wngradp  s    rM  c                 K   s"   t f d|i|}t| |||dS )Nr>   rB  )r   r?  )r%  rI   rK  r7  r>  rJ   Zstorm_optimizerr   r   r   build_storm  s    rN  c                 K   s"   t f d|i|}t| |||dS rJ  )r   r?  )r%  rI   rK  r7  r>  rJ   Zadadelta_optimizerr   r   r   build_adadelta  s    rO  c                 K   s"   t f d|i|}t| |||dS rJ  )r  r?  )r%  rI   r7  r>  rJ   Zadam_optimizerr   r   r   
build_adam  s    rP  c                 K   s"   t f d|i|}t| |||dS rJ  )r  r?  )r%  rI   r7  r>  rJ   Zdecay_adagrad_optimizerr   r   r   build_decay_adagrad  s    rQ  r   c                 K   s   t f d|i|}t| |S )Nr   )r  r?  )r%  rI   rJ   Zyellowfin_optimizerr   r   r   build_yellowfin  s    rR  c                 K   s"   t f d|i|}t| |||dS rJ  )r!  r?  )r%  rI   r7  r>  rJ   Zrms_prop_optimizerr   r   r   build_rms_prop  s    rS  )NN)FTNF)NF)NF)rF  )r+   )NNF)NNF)NNF)NNF)NF)NF)r   )NF)?r2   loggingcollectionsr   r   Znumpyr}   Zcaffe2.protor   Zcaffe2.pythonr   r   r   r   Zcaffe2.python.modelingr	   Zpast.builtinsr
   r*  r   intr   r   	getLoggerr   r   r   objectr   rf   r   r   r   r   r   r   r   r   r  r  r  r  r!  r'  r)  r+  r-  r9  r?  r@  rC  rD  rE  rH  rI  rL  rM  rN  rO  rP  rQ  rR  rS  r   r   r   r   <module>   s   

	 SuFl    &chR8. (__J

$    
8  
  



   
   
   
   
  
  

  