# Module caffe2.python.regularizer


from caffe2.python import core, utils
import numpy as np


class RegularizationBy(object):
    AFTER_OPTIMIZER = "after_optimizer"
    ON_LOSS = "on_loss"
d ZdddZdS )Regularizerc                 C   s
   d| _ d S )Ng&.>)kEpsilon)selfr   r   r	   __init__   s    zRegularizer.__init__Nc                 C   sv   t |tjsttt}|| ks>td| j	|| d| }t
| |sbtd| j	|t| |||||S )Nz>Regularizer of type {} is called with invalid by={}, not in {}Z_run_z5Regularizer of type {} does not implement function {})
isinstancer   ZBlobReferenceAssertionErrorr   ZEnumClassKeyValsr   valuesformat	__class__hasattrgetattr)r   netparam_init_netparamgradZbyZby_enumZrun_funcr   r   r	   __call__   s(    
    zRegularizer.__call__c                 C   s   d S Nr   r   r   r   r   r   r   r   r	   _run_on_loss'   s    zRegularizer._run_on_lossc                 C   s   d S r   r   r   r   r   r	   _run_after_optimizer*   s    z Regularizer._run_after_optimizerc                 C   sL   | ||g|dg}||g|dg}|j|g|dgdd}|S )N	param_mulparam_reducedgrouped_feature_weight_vec      ?exponent)MulNextScopedBlobReduceFrontSumPow)r   r   r   r   r   r    r   r   r	   _feature_grouping-   s     

zRegularizer._feature_groupingFc	           
      C   st   |d k	r|s|r|| j  n|}|d k	r8|s.|r8|| j  n|}t|tjrV||j|jgn|g}	|j|	|g||d d S )N)minmax)r   r   r   GradientSliceindicesr   ZEnsureClipped)
r   r   r   r   r)   r*   
open_range	left_open
right_openZinput_blobsr   r   r	   _ensure_clipped=   s*    
zRegularizer._ensure_clipped)NN)N)NNNFFF)	r   r   r   r   r   r   r   r(   r0   r   r   r   r	   r
      s   	

      r
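
# Editorial usage sketch (hypothetical net/blob names, not part of the
# original module): a concrete regularizer is invoked through
# Regularizer.__call__ with the phase it should run in, which dispatches to
# _run_on_loss or _run_after_optimizer as defined above.
#
#   reg = L1Norm(reg_lambda=1e-4)
#   loss_term = reg(net, param_init_net, param, grad=None,
#                   by=RegularizationBy.ON_LOSS)      # -> _run_on_loss
#
#   clip = MaxNorm(norm=1.0)
#   clip(net, param_init_net, param, grad=sparse_grad,
#        by=RegularizationBy.AFTER_OPTIMIZER)         # -> _run_after_optimizer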


class L1Norm(Regularizer):
    def __init__(self, reg_lambda):
        super(L1Norm, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l1_regularization")
        net.LpNorm([param], [output_blob], p=1)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob


class LpNorm(Regularizer):
    def __init__(self, reg_lambda, p_value=0.5):
        """
        reg_lambda: parameter to scale regularization by

        p_value:    determines what type of Lp norm to calculate. If p > 0,
                    we will calculate Lp norm with the formula:
                    pow( sum_i { pow(theta_i, p) } ,  1/p)
        """
        super(LpNorm, self).__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        assert p_value > 0, "p_value factor should be greater than 0"

        self.p_value = p_value
        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
        grouped_feature_weight_vec = self._feature_grouping(param, net)

        # pow( sum_i { pow(theta_i, p) }, 1/p )
        lp_vec_raised = net.Pow(
            [grouped_feature_weight_vec],
            [net.NextScopedBlob("lp_vec_raised")],
            exponent=self.p_value,
        )
        lp_vec_summed = net.ReduceFrontSum(
            [lp_vec_raised], [net.NextScopedBlob("lp_vec_summed")]
        )
        lp_norm = net.Pow(
            [lp_vec_summed],
            [net.NextScopedBlob("lp_vec")],
            exponent=1 / self.p_value,
        )
        net.Scale([lp_norm], [output_blob], scale=self.reg_lambda)
        return output_blob


class L0ApproxNorm(Regularizer):
    def __init__(self, reg_lambda, alpha=0.01, budget=0):
        """
        reg_lambda: parameter to scale regularization by

        alpha:      hyper parameter to tune that is only used in the calculation
                    of approximate L0 norm

        budget:     desired number of features. If the number of features is greater
                    than the budget amount, then the least important features will
                    be penalized. If there are fewer features than the desired
                    budget, no penalization will be applied. Optional parameter, if
                    0, then no budget is used
        """
        super(L0ApproxNorm, self).__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        assert alpha > 0, "alpha factor must be a positive value greater than 0"
        assert budget >= 0, "budget factor must be greater than or equal to 0"

        self.reg_lambda = reg_lambda
        self.alpha = alpha
        self.budget = float(budget)  # budget must be a float for the math below

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
        grouped_feature_weight_vec = self._feature_grouping(param, net)

        # approximate L0 norm: each feature contributes min(|w|, alpha) / alpha
        l0_abs = net.Abs([grouped_feature_weight_vec], [net.NextScopedBlob("l0_abs")])
        l0_min = net.Clip([l0_abs], [net.NextScopedBlob("l0_min")], max=self.alpha)
        l0_summed = net.ReduceFrontSum([l0_min], [net.NextScopedBlob("l0_summed")])
        l0_norm = net.Scale(
            [l0_summed], [net.NextScopedBlob("l0_norm")], scale=1 / self.alpha
        )

        # incorporate the budget factor:
        # regularization = reg_lambda * max(0, l0_norm - budget)
        if self.budget:
            budget_blob = net.ConstantFill([], "budget", shape=[1], value=self.budget)
            l0_sub_budget = net.Sub(
                [l0_norm, budget_blob], [net.NextScopedBlob("l0_budget")]
            )
            relu_l0_sub_budget = net.Relu(
                [l0_sub_budget], [net.NextScopedBlob("relu_l0_sub_budget")]
            )
            net.Scale([relu_l0_sub_budget], [output_blob], scale=self.reg_lambda)
        else:
            net.Scale([l0_norm], [output_blob], scale=self.reg_lambda)
        return output_blob

class L1NormTrimmed(Regularizer):
    """
    The Trimmed Lasso: Sparsity and Robustness. https://arxiv.org/abs/1708.04527
    """

    def __init__(self, reg_lambda, k):
        super(L1NormTrimmed, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
        assert isinstance(k, int), "k should be an integer (expected # kept after selection)"
        assert k >= 1, "k should be at least 1"

        self.reg_lambda = reg_lambda
        self.k = k

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l1_trimmed_regularization")
        abs_blob = net.Abs([param], [net.NextScopedBlob("abs")])
        sum_abs = net.SumElements(
            [abs_blob], [net.NextScopedBlob("sum_abs")], average=False
        )
        topk, _, _ = net.TopK(
            [abs_blob],
            [
                net.NextScopedBlob("topk"),
                net.NextScopedBlob("id"),
                net.NextScopedBlob("flat_id"),
            ],
            k=self.k,
        )
        topk_sum = net.SumElements(
            [topk], [net.NextScopedBlob("topk_sum")], average=False
        )
        # Penalize everything except the k largest-magnitude entries.
        net.Sub([sum_abs, topk_sum], [output_blob])
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob


class L2Norm(Regularizer):
    def __init__(self, reg_lambda):
        super(L2Norm, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l2_regularization")
        net.LpNorm([param], [output_blob], p=2)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob


class ElasticNet(Regularizer):
    def __init__(self, l1, l2):
        super(ElasticNet, self).__init__()
        self.l1 = l1
        self.l2 = l2

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_elastic_net_regularization")
        l2_blob = net.NextScopedBlob(param + "_l2_blob")
        l1_blob = net.NextScopedBlob(param + "_l1_blob")
        net.LpNorm([param], [l2_blob], p=2)
        net.LpNorm([param], [l1_blob], p=1)
        net.Scale([l2_blob], [l2_blob], scale=self.l2)
        net.Scale([l1_blob], [l1_blob], scale=self.l1)
        net.Add([l2_blob, l1_blob], [output_blob])
        return output_blob


class ElasticNetL1NormTrimmed(Regularizer):
    def __init__(self, l1, l2, k):
        super(ElasticNetL1NormTrimmed, self).__init__()
        self.l1 = l1
        self.l2 = l2
        self.k = k

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(
            param + "_elastic_net_l1_trimmed_regularization"
        )
        l2_blob = net.NextScopedBlob(param + "_l2_blob")
        net.LpNorm([param], [l2_blob], p=2)
        net.Scale([l2_blob], [l2_blob], scale=self.l2)

        # Trimmed l1 part: sum of |w| minus the sum of the k largest |w|.
        l1_blob = net.NextScopedBlob(param + "_l1_blob")
        abs_blob = net.Abs([param], [net.NextScopedBlob("abs")])
        sum_abs = net.SumElements(
            [abs_blob], [net.NextScopedBlob("sum_abs")], average=False
        )
        topk, _, _ = net.TopK(
            [abs_blob],
            [
                net.NextScopedBlob("topk"),
                net.NextScopedBlob("id"),
                net.NextScopedBlob("flat_id"),
            ],
            k=self.k,
        )
        topk_sum = net.SumElements(
            [topk], [net.NextScopedBlob("topk_sum")], average=False
        )
        net.Sub([sum_abs, topk_sum], [l1_blob])
        net.Scale([l1_blob], [l1_blob], scale=self.l1)

        net.Add([l2_blob, l1_blob], [output_blob])
        return output_blob


class MaxNorm(Regularizer):
    def __init__(self, norm=1.0, dtype=None):
        super(MaxNorm, self).__init__()
        self.norm = norm
        self.dtype = dtype

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            if self.dtype and self.dtype == "fp16":
                net.Float16SparseNormalize(
                    [param, grad.indices],
                    [param],
                    use_max_norm=True,
                    norm=self.norm,
                )
            else:
                net.SparseNormalize(
                    [param, grad.indices],
                    [param],
                    use_max_norm=True,
                    norm=self.norm,
                )
        else:
            raise NotImplementedError("MaxNorm is not supported for dense parameters")


class ConstantNorm(Regularizer):
    def __init__(self, norm=1.0):
        super(ConstantNorm, self).__init__()
        self.norm = norm

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            net.SparseNormalize(
                [param, grad.indices],
                [param],
                use_max_norm=False,
                norm=self.norm,
            )
        else:
            raise NotImplementedError(
                "ConstantNorm is not supported for dense parameters"
            )


class SparseLpNorm(Regularizer):
    def __init__(self, p, reg_lambda):
        super(SparseLpNorm, self).__init__()
        assert p in (
            1.0,
            2.0,
        ), "Sparse Lp regularization only implemented for p = 1.0 and p = 2.0."
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0."
        self.p = p
        self.reg_lambda = reg_lambda

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        if isinstance(grad, core.GradientSlice):
            net.SparseLpRegularizer(
                [param, grad.indices],
                [param],
                p=self.p,
                reg_lambda=self.reg_lambda,
            )
        else:
            raise NotImplementedError(
                "SparseLpNorm is not supported for dense parameters"
            )


class SparseL1Norm(SparseLpNorm):
    def __init__(self, reg_lambda):
        super(SparseL1Norm, self).__init__(p=1.0, reg_lambda=reg_lambda)


class SparseL2Norm(SparseLpNorm):
    def __init__(self, reg_lambda):
        super(SparseL2Norm, self).__init__(p=2.0, reg_lambda=reg_lambda)
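
# Note (editorial): the regularizers above (MaxNorm, ConstantNorm,
# SparseLpNorm and its L1/L2 shortcuts) run after the optimizer step and only
# support sparse gradients (core.GradientSlice); they rewrite just the rows of
# `param` addressed by grad.indices. A hypothetical wiring sketch:
#
#   reg = SparseL2Norm(reg_lambda=1e-4)
#   reg(net, param_init_net, param, grad=sparse_grad,
#       by=RegularizationBy.AFTER_OPTIMIZER)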
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 19
    invNc                    s>   t t|   |dkstd|| _|| _|p6ddd| _dS )z
        discount is a positive weight that is decreasing, and here it is implemented
        similar to the learning rate. It is specified by a learning rate policy and
        corresponding options
        """
        super(LogBarrier, self).__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
        self.reg_lambda = reg_lambda
        self.discount_policy = discount_policy
        self.discount_options = discount_options or {"gamma": 1.0, "power": 1.0}

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        iteration = utils.BuildUniqueMutexIter(param_init_net, net)
        # The LearningRate schedule (with a negative base_lr, since we are
        # most likely minimizing) produces the decaying discount weight.
        discount = net.NextScopedBlob(param + "_log_barrier_discount")
        net.LearningRate(
            [iteration],
            [discount],
            base_lr=-self.reg_lambda,
            policy=self.discount_policy,
            **self.discount_options
        )
        # param might be slightly negative at initialization or after
        # distributed updates; keep it at least kEpsilon before taking logs.
        param_non_neg = net.NextScopedBlob(param + "_non_neg")
        net.Clip([param], [param_non_neg], min=self.kEpsilon)
        param_log = net.NextScopedBlob(param + "_log")
        net.Log([param_non_neg], [param_log])
        param_log_sum = net.NextScopedBlob(param + "_log_sum")
        net.SumElements([param_log], [param_log_sum])
        output_blob = net.NextScopedBlob(param + "_log_barrier")
        net.Mul([param_log_sum, discount], [output_blob], broadcast=1)
        return output_blob

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        self._ensure_clipped(net, param, grad, min=0, open_range=True)
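
# Minimal sketch of the barrier term built above, in NumPy notation and
# assuming a positive, decaying discount d_t (the negative base_lr makes the
# schedule output carry the required sign):
#
#   w = np.clip(w, 1e-9, None)            # mirrors Clip(min=kEpsilon)
#   penalty = -d_t * np.sum(np.log(w))    # grows without bound as w -> 0+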
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 16
    NFc                    s   t t|   |d k	rt|nd }|d k	r2t|nd }|d k	rFt|n| j}|dksdtdj|d|d ks|d ks||r~|nd ||r|nd kstdj|||rdnd|rdnd	|d
|| _|| _|| _|| _	|| _
d S )Nr   z2Bounded Gradient Projection with invalid eps={eps})epsg        zLBounded Gradient Projection with invalid {lp}ub={ub}, lb={lb}{rp}, eps={eps}([)])lbublprpr   )r4   r   r   rK   r   r   r   r.   r/   r   r   )r   r   r   r.   r/   epsilonr7   r   r	   r     s8    

z"BoundedGradientProjection.__init__c              	   C   s$   | j |||| j| j| j| jd d S )N)r)   r*   r.   r/   )r0   r   r   r.   r/   r   r   r   r	   r     s    z.BoundedGradientProjection._run_after_optimizer)NNFFN)r   r   r   rg   r   r   rB   r   r   r7   r	   r     s            !r   c                       s,   e Zd ZdZd fdd	Zd	ddZ  ZS )


class GroupL1Norm(Regularizer):
    """
    Scardapane, Simone, et al. "Group sparse regularization for deep neural networks."
    Neurocomputing 241 (2017): 81-89.

    This regularizer computes the l1 norm of a weight matrix based on groups.
    There are essentially three stages in the computation:
    1. Compute the l2 norm on all the members of each group
    2. Scale each l2 norm by the size of each group
    3. Compute the l1 norm of the scaled l2 norms
    """

    def __init__(self, reg_lambda, groups, stabilizing_val=0):
        """
        Args:
            reg_lambda: The weight of the regularization term.
            groups: A list of integers describing the size of each group.
                The length of the list is the number of groups.

        Optional Args:
            stabilizing_val: The computation of GroupL1Norm involves the Sqrt
                operator. When values are small, its gradient can be numerically
                unstable, causing gradient explosion. Adding this term
                stabilizes the gradient calculation. The recommended value is
                1e-8, but it depends on the specific scenario. If the
                implementation of the Sqrt gradient operator already takes
                stability into consideration, this term won't be necessary.
        """
        super(GroupL1Norm, self).__init__()
        assert (
            reg_lambda >= 0
        ), "regularization weight should be 0 or positive"
        assert isinstance(groups, list), "groups needs to be a list"

        self.reg_lambda = reg_lambda
        self.groups = groups
        self.stabilizing_val = stabilizing_val

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """
        Args:
            param: The input blob to regularize. It should be a weight matrix
                blob with shape (output_dim, input_dim). input_dim should be
                equal to the sum of self.groups.

        Returns:
            group_l1_norm: The output blob after applying regularization.

        These are the steps of computation:
            1. square all elements
            2. sum by row
            3. LengthsSum by group
            4. square_root all elements
            5. normalize each group based on group size
            6. compute l1 norm of each group
            7. scale the result with the regularization lambda
        """
        squared = net.Sqr(param)
        reduced_sum = net.ReduceSum(squared, axes=[0], keepdims=0)
        lengths_sum = net.LengthsSum(
            [
                reduced_sum,
                net.GivenTensorIntFill(
                    [], 1, shape=[len(self.groups)], values=self.groups
                ),
            ]
        )

        if self.stabilizing_val:
            net.Add(
                [lengths_sum, net.ConstantFill([], 1, value=self.stabilizing_val)],
                [lengths_sum],
                broadcast=1,
            )

        sqrt = net.Sqrt(lengths_sum)

        # Combine step 5 and step 7 into a single Mul for efficiency: each
        # group's l2 norm is multiplied by sqrt(group size) * reg_lambda.
        l2_scaled = net.Mul(
            [
                sqrt,
                net.GivenTensorFill(
                    [],
                    1,
                    shape=[len(self.groups)],
                    values=np.sqrt(self.groups) * self.reg_lambda,
                ),
            ],
            ["normalized_l2_norm_scaled"],
        )

        group_l1_norm = net.LpNorm(l2_scaled, ["group_l1_norm"], p=1)

        return group_l1_norm
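

# Editorial appendix: a self-contained NumPy sanity check of the GroupL1Norm
# arithmetic documented above. It builds no Caffe2 nets; the weight matrix,
# group sizes, and reg_lambda below are made-up illustration values.
if __name__ == "__main__":
    groups = [2, 3]
    reg_lambda = 0.1
    w = np.arange(10, dtype=np.float32).reshape(2, 5)  # (output_dim, input_dim)

    per_input = np.sum(w * w, axis=0)                  # steps 1-2: square, sum by row
    bounds = np.cumsum([0] + groups)
    per_group = np.array(
        [per_input[b:e].sum() for b, e in zip(bounds[:-1], bounds[1:])]
    )                                                  # step 3: LengthsSum by group
    group_l2 = np.sqrt(per_group)                      # step 4: square root
    penalty = np.sum(np.sqrt(groups) * reg_lambda * group_l2)  # steps 5-7 combined
    print("GroupL1Norm penalty:", penalty)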