U
    9%eO"                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlmZmZ ddlm	Z	 e	
eZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd  d eZeed!d"d#feeed$d%ifeeeeeeejeejejeeejd&ZeeZ d'd( Z!e!d)Z"e!d*Z#e!d+Z$e!d,Z%e!d-Z&e!d.Z'e!d/Z(e!d0Z)dS )1    N)OrderedDict)version)Tensornn   )loggingc                       s0   e Zd ZdZ fddZeedddZ  ZS )PytorchGELUTanha  
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://arxiv.org/abs/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    c                    s6   t    ttjtdk r2tdtj dd S )Nz1.12.0zYou are using torch==zM, but torch>=1.12.0 is required to use PytorchGELUTanh. Please upgrade torch.)super__init__r   parsetorch__version__ImportErrorself	__class__ W/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/activations.pyr
   %   s
    
zPytorchGELUTanh.__init__inputreturnc                 C   s   t jj|ddS )Ntanh)Zapproximate)r   
functionalgelur   r   r   r   r   forward-   s    zPytorchGELUTanh.forward__name__
__module____qualname____doc__r
   r   r   __classcell__r   r   r   r   r      s   r   c                   @   s    e Zd ZdZeedddZdS )NewGELUActivationz
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    r   c                 C   s6   d| dt tdtj |dt |d     S )N      ?      ?       @Hm?g      @)r   r   mathsqrtpipowr   r   r   r   r   7   s    zNewGELUActivation.forwardNr   r   r    r!   r   r   r   r   r   r   r#   1   s   r#   c                       sH   e Zd ZdZded fddZeedddZeedd	d
Z  Z	S )GELUActivationa  
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    F)use_gelu_pythonc                    s&   t    |r| j| _n
tjj| _d S N)r	   r
   _gelu_pythonactr   r   r   )r   r.   r   r   r   r
   C   s    

zGELUActivation.__init__r   c                 C   s    |d dt |td   S )Nr$   r%   r&   )r   erfr(   r)   r   r   r   r   r0   J   s    zGELUActivation._gelu_pythonc                 C   s
   |  |S r/   r1   r   r   r   r   r   M   s    zGELUActivation.forward)F)
r   r   r    r!   boolr
   r   r0   r   r"   r   r   r   r   r-   ;   s   r-   c                   @   s    e Zd ZdZeedddZdS )FastGELUActivationz}
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    r   c                 C   s*   d| dt |d dd| |     S )Nr$   r%   g3E?r'   )r   r   r   r   r   r   r   V   s    zFastGELUActivation.forwardNr,   r   r   r   r   r5   Q   s   r5   c                   @   s    e Zd ZdZeedddZdS )QuickGELUActivationzr
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    r   c                 C   s   |t d|  S )NgZd;?)r   sigmoidr   r   r   r   r   _   s    zQuickGELUActivation.forwardNr,   r   r   r   r   r6   Z   s   r6   c                       s8   e Zd ZdZeed fddZeedddZ  ZS )ClippedGELUActivationa  
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://arxiv.org/abs/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
    minmaxc                    s8   ||krt d| d| dt   || _|| _d S )Nzmin should be < max (got min: z, max: ))
ValueErrorr	   r
   r:   r;   )r   r:   r;   r   r   r   r
   p   s
    
zClippedGELUActivation.__init__)xr   c                 C   s   t t|| j| jS r/   )r   Zclipr   r:   r;   )r   r>   r   r   r   r   x   s    zClippedGELUActivation.forward)	r   r   r    r!   floatr
   r   r   r"   r   r   r   r   r8   c   s   r8   c                       s0   e Zd ZdZ fddZeedddZ  ZS )AccurateGELUActivationz
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    c                    s    t    tdtj | _d S )N   )r	   r
   r(   r)   r*   precomputed_constantr   r   r   r   r
      s    
zAccurateGELUActivation.__init__r   c                 C   s,   d| dt | j|dt |d     S )Nr$   r   r'      )r   r   rB   r+   r   r   r   r   r      s    zAccurateGELUActivation.forwardr   r   r   r   r   r@   |   s   r@   c                   @   s    e Zd ZdZeedddZdS )SiLUActivationa  
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    r   c                 C   s   t j|S r/   )r   r   silur   r   r   r   r      s    zSiLUActivation.forwardNr,   r   r   r   r   rD      s   rD   c                       s@   e Zd ZdZ fddZeedddZeedddZ  ZS )	MishActivationz
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    c                    s8   t    ttjtdk r*| j| _n
tj	j
| _d S )Nz1.9.0)r	   r
   r   r   r   r   _mish_pythonr1   r   r   mishr   r   r   r   r
      s    

zMishActivation.__init__r   c                 C   s   |t tj| S r/   )r   r   r   r   Zsoftplusr   r   r   r   rG      s    zMishActivation._mish_pythonc                 C   s
   |  |S r/   r3   r   r   r   r   r      s    zMishActivation.forward)	r   r   r    r!   r
   r   rG   r   r"   r   r   r   r   rF      s   rF   c                   @   s    e Zd ZdZeedddZdS )LinearActivationz[
    Applies the linear activation function, i.e. forwarding input directly to output.
    r   c                 C   s   |S r/   r   r   r   r   r   r      s    zLinearActivation.forwardNr,   r   r   r   r   rI      s   rI   c                   @   s   e Zd ZdZdddZdS )LaplaceActivationz
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://arxiv.org/abs/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    绹۞? ^/?c                 C   s*   ||  |td }ddt|  S )Nr&   r$   r%   )divr(   r)   r   r2   )r   r   musigmar   r   r   r      s    zLaplaceActivation.forwardN)rK   rL   r   r   r    r!   r   r   r   r   r   rJ      s   rJ   c                   @   s   e Zd ZdZdd ZdS )ReLUSquaredActivationzX
    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
    c                 C   s   t j|}t|}|S r/   )r   r   relur   Zsquare)r   r   Zrelu_appliedZsquaredr   r   r   r      s    
zReLUSquaredActivation.forwardNrP   r   r   r   r   rQ      s   rQ   c                       s   e Zd Z fddZ  ZS )ClassInstantierc                    s0   t  |}t|tr|n|i f\}}|f |S r/   )r	   __getitem__
isinstancetuple)r   keycontentclskwargsr   r   r   rT      s    zClassInstantier.__getitem__)r   r   r    rT   r"   r   r   r   r   rS      s   rS   i
   r9   r.   T)r   Zgelu_10	gelu_fastgelu_newgelu_pythonZgelu_pytorch_tanhZgelu_accurateZlaplacelinearrH   
quick_gelurR   Zrelu2Zrelu6r7   rE   Zswishr   c                 C   s0   | t krt |  S td|  dtt   d S )Nz	function z not found in ACT2FN mapping )ACT2FNKeyErrorlistkeys)Zactivation_stringr   r   r   get_activation   s    re   r^   r]   r   r\   r`   rE   rH   r_   )*r(   collectionsr   r   	packagingr   r   r   utilsr   Z
get_loggerr   loggerModuler   r#   r-   r5   r6   r8   r@   rD   rF   rI   rJ   rQ   rS   ZReLUZReLU6ZSigmoidZTanhZACT2CLSra   re   r^   r]   r   r\   r`   rE   rH   Z
linear_actr   r   r   r   <module>   s`   

			
