"""PyTorch MEGA model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_mega import MegaConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "mnaylor/mega-base-wikitext"
_CONFIG_FOR_DOC = "MegaConfig"

MEGA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "mnaylor/mega-base-wikitext",
]


class MegaEmbeddings(nn.Module):
    """
    Mega's basic implementation does not incorporate token type embeddings, so this is a stripped-down version of
    RoBERTa's embeddings which optionally includes token types
    """

    def __init__(self, config: MegaConfig):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.use_token_types = config.add_token_type_embeddings
        if self.use_token_types:
            self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
            # buffered token type IDs help when tracing the model without passing token_type_ids
            self.register_buffer(
                "token_type_ids", torch.zeros(config.max_positions, dtype=torch.long).expand((1, -1)), persistent=False
            )
        self.padding_idx = config.pad_token_id

    def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None):
        if (input_ids is None) and (inputs_embeds is None):
            raise ValueError("Must provide one of input_ids or inputs_embeds")
        elif input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
            inputs_embeds = self.word_embeddings(input_ids)
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        if self.use_token_types:
            if token_type_ids is None:
                if hasattr(self, "token_type_ids"):
                    buffered_token_type_ids = self.token_type_ids[:, : input_shape[1]]
                    buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], input_shape[1])
                    token_type_ids = buffered_token_type_ids_expanded
                else:
                    token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

            token_type_embeddings = self.token_type_embeddings(token_type_ids)
            embeddings = inputs_embeds + token_type_embeddings
        else:
            embeddings = inputs_embeds
        return embeddings


class MegaSimpleRelativePositionalBias(nn.Module):
    """
    Simple relative positional embeddings copied from the Mega repo; renamed variables for better readability
    """

    def __init__(self, config: MegaConfig):
        super().__init__()
        self.config = config
        self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
        self.rel_pos_bias = nn.Parameter(torch.Tensor(2 * config.max_positions - 1))

    def forward(self, seq_len):
        if seq_len > self.max_positions:
            raise ValueError("Sequence length {} going beyond max length {}".format(seq_len, self.max_positions))

        # seq_len * 2 - 1
        bias = self.rel_pos_bias[(self.max_positions - seq_len) : (self.max_positions + seq_len - 1)]
        # seq_len * 3 - 1
        tile = F.pad(bias, (0, seq_len))
        # (seq_len * 3 - 1) * seq_len
        tile = torch.tile(tile, (seq_len,))
        tile = tile[:-seq_len]
        # seq_len x (3 * seq_len - 2)
        bias = tile.view(seq_len, 3 * seq_len - 2)
        start = (2 * seq_len - 1) // 2
        end = tile.size(1) - start
        bias = bias[:, start:end]
        return bias


class MegaRotaryRelativePositionalBias(nn.Module):
    """
    Rotary relative bias for positional information; similar in concept to RoPE (i.e. RoFormer) but taken from the Mega
    repo due to differences in implementation.

    When initialized, produces a positional bias which ranges from position 0 to config.max_positions, but can
    extrapolate to longer sequences. Can be indexed according to input position IDs
    """

    def __init__(self, config: MegaConfig):
        super().__init__()
        if config.hidden_size % 2 != 0:
            raise RuntimeError("Rotary positional bias requires `hidden_size` to be a multiple of 2")
        self.config = config
        self.embed_dim = config.shared_representation_size
        self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
        self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(
            config.max_positions, self.embed_dim
        )
        # alpha and beta parameters of the rotary bias; beta is renamed b_param here
        self.alpha = nn.Parameter(torch.Tensor(1, self.embed_dim))
        self.b_param = nn.Parameter(torch.Tensor(1, self.embed_dim))
        self.register_buffer("_float_tensor", torch.FloatTensor([0.0]))

    @staticmethod
    def get_sinusoid_embeddings(max_positions: int, embedding_dim: int):
        half_dim = embedding_dim // 2
        emb = math.log(10000) / half_dim
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        return torch.sin(emb), torch.cos(emb)

    def rotary(self, input):
        seq_len, embed_dim = input.size()
        chunk_1, chunk_2 = torch.chunk(input, 2, dim=-1)
        if self.sine is None or seq_len > self.sine.size(0):
            self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(seq_len, embed_dim)
            self.max_positions = seq_len
        self.sine = self.sine.to(self._float_tensor)
        self.cosine = self.cosine.to(self._float_tensor)

        sin = self.sine[:seq_len]
        cos = self.cosine[:seq_len]
        return torch.cat([chunk_1 * cos - chunk_2 * sin, chunk_2 * cos + chunk_1 * sin], dim=1)

    def forward(self, seq_len):
        rotary_alpha = self.rotary(self.alpha.expand(seq_len, self.embed_dim))
        rotary_beta = self.rotary(self.b_param.expand(seq_len, self.embed_dim))
        bias = torch.einsum("mk,nk->mn", rotary_alpha, rotary_beta)
        return bias


class MegaDropout(nn.Module):
    """
    A unified class for standard dropout functionality and featurewise dropout.

    The original fairseq Mega repo used 2 classes for these, which included some unnecessary handling of training logic
    and an unused `inplace` option. The original implementation used torch.nn.functional instead of submodules, which
    is retained here as well.
    """

    def __init__(self, dropout_probability, is_featurewise=False):
        super().__init__()
        self.dropout_probability = dropout_probability
        self.is_featurewise = is_featurewise

    def forward(self, input, batch_first: bool = False):
        if self.is_featurewise:
            if batch_first:
                # (batch_size X sequence_length X feature_dimension)
                return F.dropout2d(
                    input.transpose(-1, -2), p=self.dropout_probability, training=self.training
                ).transpose(-1, -2)
            else:
                if input.dim() != 3:
                    raise ValueError(
                        "Feature dropout inputs must be exactly 3-dimensional if inputs are ordered [sequence length, batch size, hidden dimension]"
                    )
                # (sequence_length X batch_size X feature_dimension)
                return F.dropout2d(
                    input.permute(1, 2, 0), p=self.dropout_probability, training=self.training
                ).permute(2, 0, 1)
        else:
            return F.dropout(input, p=self.dropout_probability, training=self.training)


class MegaRMSNorm(nn.Module):
    """
    RMSNorm used in Mega implementation. Differs from T5's RMSNorm by applying the weight prior to taking the square
    root (as opposed to after in T5)
    """

    def __init__(self, number_features, eps=1e-6, affine=True):
        super().__init__()
        self.num_features = number_features
        self.eps = eps
        self.affine = affine
        if affine:
            self.weight = nn.Parameter(torch.Tensor(self.num_features))
        else:
            self.register_parameter("weight", None)

    def forward(self, input):
        mean_square = torch.mean(torch.square(input), dim=-1, keepdim=True)
        if self.weight is not None:
            input = input * self.weight

        input = input * torch.rsqrt(mean_square + self.eps)
        return input


class MegaScaleNorm(nn.Module):
    """
    Scale normalization introduced in MEGA which is similar to RMSNorm, but uses a single parameter for scalar
    multiplication instead of a vector, and applies over a specified dimension
    """

    def __init__(self, dim, eps=1e-6, affine=True):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.affine = affine
        if affine:
            self.scalar = nn.Parameter(torch.Tensor(1))
        else:
            self.register_parameter("scalar", None)

    def forward(self, input):
        mean_square = torch.mean(torch.square(input), dim=self.dim, keepdim=True)
        if self.scalar is not None:
            input = self.scalar * input

        output = input * torch.rsqrt(mean_square + self.eps)
        return output


class MegaSequenceNorm(nn.Module):
    """
    A wrapper class for various layer normalization options used in Mega. Used to handle differences in expectations on
    input axis locations for different normalization methods.
    """

    def __init__(self, norm_type, embedding_dim, eps=1e-5, affine=True, export=False):
        super().__init__()
        if norm_type == "layernorm":
            self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine=affine)
        elif norm_type == "scalenorm":
            self.norm = MegaScaleNorm(dim=-1, eps=eps, affine=affine)
        elif norm_type == "rmsnorm":
            self.norm = MegaRMSNorm(embedding_dim, eps=eps, affine=affine)
        elif norm_type == "batchnorm":
            self.norm = nn.BatchNorm1d(embedding_dim, eps=eps, affine=affine)
        elif norm_type == "syncbatchnorm":
            self.norm = nn.SyncBatchNorm(embedding_dim, eps=eps, affine=affine)
        else:
            raise ValueError("Unknown norm type: {}".format(norm_type))

    def forward(self, input):
        if isinstance(self.norm, nn.modules.batchnorm._BatchNorm):
            if input.dim() != 3:
                raise ValueError("BatchNorm inputs must be exactly 3-dimensional")
            input = input.permute(1, 2, 0)
            input = self.norm(input)
            return input.permute(2, 0, 1)
        else:
            return self.norm(input)


# add this layernorm class to ALL_LAYERNORM_LAYERS
ALL_LAYERNORM_LAYERS.append(MegaSequenceNorm)


class MegaMultiDimensionDampedEma(nn.Module):
    """
    Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
    variable names and moving away from the stateful representation of incremental decoding state. See
    "https://arxiv.org/abs/2209.10655" for more details.
    """

    def __init__(self, config: MegaConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.ndim = config.ema_projection_size
        self.bidirectional = config.bidirectional
        self.truncation = config.truncation
        self.scale = math.sqrt(1.0 / self.ndim)

        kernel_dim = 2 * config.hidden_size if self.bidirectional else config.hidden_size
        # renamed from delta, alpha, beta, gamma, and omega in the original Mega repo
        self.damping_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
        self.decay_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
        self.ema_expansion_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
        self.kernel_projection_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim))
        self.residual_weight = nn.Parameter(torch.Tensor(config.hidden_size))
        self._kernel = None
        self._coeffs = None

    def _compute_ema_coefficients(self):
        self._coeffs = None
        damping_factor = torch.sigmoid(self.damping_factor)
        decay_factor = torch.sigmoid(self.decay_factor)
        previous_timestep_weight = 1.0 - damping_factor * decay_factor
        return damping_factor, previous_timestep_weight

    def _compute_efficient_ema_kernel(self, length: int):
        # compute the kernel applied via FFT convolution
        self._kernel = None
        damping_factor, previous_timestep_weight = self._compute_ema_coefficients()
        vander = torch.arange(length).to(damping_factor).view(1, 1, length) * torch.log(previous_timestep_weight)
        kernel = (damping_factor * self.ema_expansion_matrix) * torch.exp(vander)
        return torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)

    def get_ema_coefficients(self):
        if self.training:
            return self._compute_ema_coefficients()
        else:
            if self._coeffs is None:
                self._coeffs = self._compute_ema_coefficients()
            return self._coeffs

    def get_ema_kernel(self, length: int):
        kernel_size = length if self.truncation is None else min(self.truncation, length)
        if self.training:
            return self._compute_efficient_ema_kernel(kernel_size)
        else:
            if self._kernel is None or self._kernel.size(-1) < kernel_size:
                self._kernel = self._compute_efficient_ema_kernel(kernel_size)
            return self._kernel[..., :kernel_size]

    def fft_convolution(self, inputs, kernel, length):
        inputs_fft = torch.fft.rfft(inputs.float(), n=2 * length)
        kernel_fft = torch.fft.rfft(kernel.float(), n=2 * length)
        convolved_sequence = torch.fft.irfft(inputs_fft * kernel_fft, n=2 * length)
        return convolved_sequence

    def ema_step(self, inputs, length, past_state=None):
        if length == 1:
            return self.one_ema_step(inputs, past_state=past_state)

        damping_factor, previous_timestep_weight = self.get_ema_coefficients()
        vander = torch.arange(length + 1).to(damping_factor).view(1, 1, length + 1) * torch.log(
            previous_timestep_weight
        )
        vander = torch.exp(vander)
        if past_state is not None:
            past_ema_proj = vander[:, :, 1:] * (self.kernel_projection_matrix * self.scale).unsqueeze(-1)
            past_ema_state = torch.einsum("bdn,dnl->bdl", past_state, past_ema_proj)
            past_vandermonde = vander[:, :, -1] * past_state
        else:
            past_ema_state = None
            past_vandermonde = None

        vander = vander[:, :, :-1]
        kernel = (damping_factor * self.ema_expansion_matrix) * vander
        kernel_proj = torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)

        ema_output = self.fft_convolution(inputs, kernel_proj, length=length)[..., :length]
        ema_output = ema_output.type_as(inputs)
        if past_ema_state is not None:
            ema_output = ema_output + past_ema_state

        updated_hidden_state = torch.einsum("bdl,dnl->bdn", inputs, torch.flip(kernel, dims=[2]))
        if past_vandermonde is not None:
            updated_hidden_state = updated_hidden_state + past_vandermonde
        return ema_output.permute(0, 2, 1), updated_hidden_state

    def one_ema_step(self, inputs, past_state=None):
        damping_factor, previous_timestep_weight = self.get_ema_coefficients()
        updated_state = (damping_factor * self.ema_expansion_matrix).squeeze(-1) * inputs
        if past_state is not None:
            updated_state = updated_state + previous_timestep_weight.squeeze(-1) * past_state
        out = torch.einsum("bdn,dn->bd", updated_state, self.kernel_projection_matrix * self.scale)
        return out.unsqueeze(0), updated_state

    def forward(
        self,
        inputs,
        attention_mask: Optional[torch.Tensor] = None,
        prev_state: Optional[torch.Tensor] = None,
        use_cache: bool = False,
    ) -> torch.Tensor:
        r"""
        Mega's exponential moving average (EMA) sub-layer applied prior to single-headed (traditional) self-attention

        Args:
            inputs (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
                Hidden state / embedding input to update via EMA based on FFT convolution
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indicates which inputs are to be ignored (mostly due to padding), where elements are either 1 for *not
                masked* or 0 for *masked*
            prev_state (`torch.Tensor` of shape `(batch_size, config.ndim)`, *optional*):
                The hidden state returned from the previous timestep during incremental decoding.
            use_cache (`bool`, default `False`):
                Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
                updated EMA hidden state for use in the next step

        Returns:
            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
            inputs:
            - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
              states updated by EMA, with same shapes as inputs
            - **updated_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor of shape `(batch_size,
              config.ndim)` -- The incremental EMA state for use in the next step of incremental decoding
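
        Example (illustrative sketch; uses a freshly initialized, untrained module, so only the shapes are meaningful):

        ```python
        >>> import torch

        >>> from transformers import MegaConfig
        >>> from transformers.models.mega.modeling_mega import MegaMultiDimensionDampedEma

        >>> config = MegaConfig(hidden_size=32, ema_projection_size=8, bidirectional=True)
        >>> ema = MegaMultiDimensionDampedEma(config)
        >>> hidden = torch.randn(16, 2, config.hidden_size)  # (sequence_length, batch_size, hidden_size)
        >>> out, state = ema(hidden, attention_mask=None, prev_state=None, use_cache=False)
        >>> out.shape
        torch.Size([16, 2, 32])
        ```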
        """
        seq_len, bsz, embed_dim = inputs.size()
        if embed_dim != self.embed_dim:
            raise ValueError(
                f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}"
            )

        # (sequence_length X batch_size X hidden_size)
        residual = inputs * self.residual_weight

        # (sequence_length X batch_size X hidden_size) -> (batch_size X hidden_size X sequence_length)
        inputs = inputs.permute(1, 2, 0)
        if attention_mask is not None:
            inputs = inputs * (attention_mask.unsqueeze(1).type_as(inputs))

        if self.bidirectional and use_cache:
            raise RuntimeError("Bidirectional EMA does not support incremental state")

        if use_cache:
            out, updated_state = self.ema_step(inputs, seq_len, past_state=prev_state)
            out = F.silu(out + residual)
            return out, updated_state
        else:
            kernel = self.get_ema_kernel(seq_len)
            fft_len = seq_len
            s_index = 0
            kernel_size = kernel.size(1)
            if self.bidirectional:
                # split the kernel for each direction of the EMA
                k1, k2 = torch.split(kernel, [self.embed_dim, self.embed_dim], dim=0)
                kernel = F.pad(k1, (kernel_size - 1, 0)) + F.pad(k2.flip(-1), (0, kernel_size - 1))
                inputs = F.pad(inputs, (kernel_size - 1, 0))
                fft_len = fft_len + kernel_size - 1
                s_index = 2 * kernel_size - 2

            ema_output = self.fft_convolution(inputs, kernel, length=fft_len)[..., s_index : s_index + seq_len]
            ema_output = ema_output.type_as(inputs)
            # (batch_size X hidden_size X sequence_length) -> (sequence_length X batch_size X hidden_size)
            gated_ema_output = F.silu(ema_output.permute(2, 0, 1) + residual)

            return gated_ema_output, None


class MegaGatedCrossAttention(nn.Module):
    """
    Gated Structured State Attention for use in encoder-decoder model. See Mega paper for more details. Only
    modifications from original implementation are variable names, removing the unnecessary `before_attn_fn` and
    `static_kv` arguments, and the stateful representation of incremental decoder state.
    """

    def __init__(self, config: MegaConfig):
        super().__init__()
        self.config = config
        self.activation = ACT2FN[self.config.activation]
        self.attention_activation = self.config.attention_activation
        self.scaling = self.config.shared_representation_size**-0.5 if self.attention_activation == "softmax" else None

        self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
        self.hidden_dropout = MegaDropout(self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout)
        # attention dropout is standard dropout
        self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)

        self.prenorm = self.config.normalize_before_mega
        self.norm = MegaSequenceNorm(self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine)

        self.k_proj = nn.Linear(self.config.hidden_size, self.config.shared_representation_size)
        self.v_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.q_proj = nn.Linear(self.config.hidden_size, 2 * self.config.hidden_size + self.config.shared_representation_size)
        self.h_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)

        if self.config.relative_positional_bias == "simple":
            self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
        elif self.config.relative_positional_bias == "rotary":
            self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
        else:
            raise ValueError("unknown relative position bias: {}".format(self.config.relative_positional_bias))

        self.softmax = nn.Softmax(dim=-1)

    def element_attention(self, query, key, key_padding_mask, pidx):
        bsz, src_len, _ = key.size()
        tgt_len = query.size(1) if pidx is None else pidx + 1
        if key_padding_mask is not None:
            # (batch_size X source_sequence_length) -> (batch_size X 1 X 1)
            lengths = key_padding_mask.sum(dim=-1).view(bsz, 1, 1)
        else:
            lengths = src_len

        # (target_sequence_length X source_sequence_length)
        bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
        if pidx is not None:
            if query.size(1) != 1:
                raise ValueError("Position offset provided with queries longer than 1 token")
            bias = bias[pidx]
        else:
            bias = bias[:tgt_len]

        qk = torch.bmm(query, key.transpose(1, 2)) / lengths + bias
        attn_weights = ACT2FN[self.attention_activation](qk).type_as(qk)

        if key_padding_mask is not None:
            attn_weights = attn_weights * key_padding_mask.unsqueeze(1)

        return attn_weights

    def softmax_attention(self, query, key, key_padding_mask, pidx):
        bsz, src_len, _ = key.size()
        tgt_len = query.size(1) if pidx is None else pidx + 1

        bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
        if pidx is not None:
            if query.size(1) != 1:
                raise ValueError("Position offset provided with queries longer than 1 token")
            bias = bias[pidx]
        else:
            bias = bias[:tgt_len]

        # scaled attention
        query = query * self.scaling
        qk = torch.bmm(query, key.transpose(1, 2)) + bias

        if key_padding_mask is not None:
            qk = qk.masked_fill((1 - key_padding_mask).unsqueeze(1).to(torch.bool), float("-inf"))

        attn_weights = self.softmax(qk).type_as(qk)
        return attn_weights

    def forward(
        self,
        query,
        key: Optional[torch.Tensor],
        value: Optional[torch.Tensor],
        key_padding_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        r"""
        Gated cross-attention used in Mega

        Args:
            query (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
                The self (or target) sequence input used as query inputs for cross-attention
            key (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
                The cross (or source) sequence input with shape used as keys in cross-attention
            value (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
                The cross (or source) sequence input with shape used as values in cross-attention
            key_padding_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
                Padding mask corresponding to the source sequence, where entries are 1 for *not masked* and 0 for
                *masked* tokens
            past_key_values (`tuple(torch.FloatTensor)`, *optional*):
                If provided, the hidden state returned from the previous timestep during incremental decoding; expects
                that prior cross-attention keys and values will be the last two items in the tuple
            output_attentions (`bool`, defaults to `False`):
                Whether or not to return the cross-attention weights.
            use_cache (`bool`, defaults to `False`):
                Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
                updated EMA hidden state for use in the next step

        Returns:
            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
            inputs:
            - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
              Hidden states from target sequence updated by gated cross-attention
            - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
              `(batch_size, source_sequence_length, target_sequence_length)` -- The pairwise cross-attention weights
              corresponding to each token in the source and target sequences
            - **cross_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
              source_sequence_length, config.shared_representation_size)` -- The cross-attention key state for use in
              the next step of incremental decoding
            - **cross_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
              source_sequence_length, config.hidden_size)` -- The cross-attention value state for use in the next step
              of incremental decoding
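
        Example (illustrative sketch; uses a freshly initialized, untrained module, so only the shapes are meaningful):

        ```python
        >>> import torch

        >>> from transformers import MegaConfig
        >>> from transformers.models.mega.modeling_mega import MegaGatedCrossAttention

        >>> config = MegaConfig(hidden_size=32, is_decoder=True, add_cross_attention=True, bidirectional=False)
        >>> cross_attn = MegaGatedCrossAttention(config)
        >>> target = torch.randn(8, 2, config.hidden_size)  # (target_sequence_length, batch_size, hidden_size)
        >>> source = torch.randn(12, 2, config.hidden_size)  # (source_sequence_length, batch_size, hidden_size)
        >>> hidden_states = cross_attn(target, source, source)[0]
        >>> hidden_states.shape
        torch.Size([8, 2, 32])
        ```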
        """
        seq_len, bsz, embed_dim = query.size()
        if embed_dim != self.config.hidden_size:
            raise ValueError(f"Unexpected embedding dimension received: input is {embed_dim} but expected {self.config.hidden_size}")

        if past_key_values is not None:
            if seq_len > 1:
                raise ValueError(f"Incremental decoding requested with self-sequence length > 1: {seq_len}")
            # prior cross-attention keys and values are the last two items in the tuple
            prev_cross_key, prev_cross_value = past_key_values[-2:]
            key = value = None

            # use the self-attention cache to infer the position of the current decoding step
            prev_self_key = past_key_values[0]
            num_incremental_steps = prev_self_key.size(1) + 1
        else:
            prev_cross_key = prev_cross_value = None
            # the number of incremental steps is only relevant with a positional offset
            num_incremental_steps = 0 if use_cache and (seq_len == 1) else None

        full_query = query
        if self.prenorm:
            full_query = self.norm(full_query)

        # (target_sequence_length X batch_size X 2*hidden_size + shared_representation_size)
        query_projected = self.q_proj(full_query)
        # split into residual weight (sigmoid gate), target gate (silu), and the attention query
        residual_weight, target_gate, attention_query = torch.split(
            query_projected,
            [self.config.hidden_size, self.config.hidden_size, self.config.shared_representation_size],
            dim=-1,
        )
        residual_weight = torch.sigmoid(residual_weight)
        target_gate = F.silu(target_gate)

        if key is None:
            if value is not None:
                raise ValueError("Key and value must be `None` simultaneously")
            projected_key = projected_value = None
        else:
            projected_key = self.k_proj(key)
            projected_value = self.activation(self.v_proj(key))

        # move to (batch_size X sequence_length X dimension)
        attention_query = attention_query.transpose(0, 1)
        if projected_key is not None:
            projected_key = projected_key.transpose(0, 1)
        if projected_value is not None:
            projected_value = projected_value.transpose(0, 1)

        if past_key_values is not None:
            projected_key = prev_cross_key
            projected_value = prev_cross_value

        if use_cache:
            updated_cross_key = projected_key
            updated_cross_value = projected_value

        ctx_len = projected_key.size(1)
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            if key_padding_mask.size(0) != bsz:
                raise ValueError("Key padding mask does not align on the batch dimension")
            if key_padding_mask.size(1) != ctx_len:
                raise ValueError("Key padding mask does not align on the sequence length dimension")

        if self.attention_activation == "softmax":
            attn_weights = self.softmax_attention(attention_query, projected_key, key_padding_mask, num_incremental_steps)
        else:
            attn_weights = self.element_attention(attention_query, projected_key, key_padding_mask, num_incremental_steps)

        projected_value = self.hidden_dropout(projected_value, batch_first=True)
        kernel = self.attention_dropout(attn_weights)
        # (batch_size X target_sequence_length X hidden_size) -> (target_sequence_length X batch_size X hidden_size)
        weighted_targets = torch.bmm(kernel, projected_value).transpose(0, 1)
        weighted_targets = self.activation(self.h_proj(weighted_targets * target_gate))
        weighted_targets = self.dropout(weighted_targets)
        out = torch.addcmul(query, residual_weight, weighted_targets - query)

        if not self.prenorm:
            out = self.norm(out)

        outputs = (out, attn_weights) if output_attentions else (out,)
        if use_cache:
            outputs = outputs + (updated_cross_key, updated_cross_value)

        return outputs


class MegaMovingAverageGatedAttention(nn.Module):
    """
    at https://github.com/facebookresearch/mega (copyright Meta Research, licensed under MIT License)

    Differences from original implementation include hidden state refactor and fixed inconsistency with additive /
    multiplicative attention masks
    r   c                    s  t    || _t| jj | _| jjdkr6| jjd nd | _t| jj	| jj
d| _t| jj| jj
d| _t| jjdd| _t| jj| jj| jjd| _t|| _t| jj| jj| _t| jj| jj| jj d| jj  | _t| jj| jj| _ttd| jj| _ ttd| jj| _!| jj"dkr@t#|| _$n,| jj"dkrZt%|| _$nt&d	| jj" tj'd
d| _(| jjdkr| j)n| j*| _+d S )Nr   r   r   Fr   rI   r   rr   z"Unknown relative positional bias: r#   rl   ),r%   r&   r   r   r   r   r[   r   rv   r   r   r   r   r   r   r   r   r   r)   r   r   r   ema_gater   r   intermediate_sizer   mx_projr   rK   r.   rL   	qk_weightqk_biasr   rH   rM   rW   r;   r   r   r   r   attention_functionr3   r5   r7   r8   r&     sB    
   
z(MegaMovingAverageGatedAttention.__init__c           
      C   s   | d}|dk	r4|jddd}|jddd}n|}|dk	rN|jddd}| |}|| dkr| dd	kr|td
|dd }t||dd| | }t	| j
j ||}	|dk	r|	|d }	|dk	r|	| }	|	S )a  
        Apply element-wise attention via relu^2 or laplace. Same as original implementation but with standardized
        causal attention mask. Expects the Hugging Face standard attention mask paradigm: 1 for not masked, and 0 for
        masked.
        rI   Nr#   T)r   r   )r   r   r   z2Size mismatch between Q and K in element attentionr
   )r<   r   clampri   rM   r;   r.   matmulr   r   r   r   r   )
r4   r   r   padding_maskcausal_maskrS   r   rT   r   r   r7   r7   r8   r   G  s&    

z1MegaMovingAverageGatedAttention.element_attentionc                 C   s   | d}| |}|| dkrD| ddkr8td|dd }|| j }t||dd| }|dk	rtj||jd}|	d| 
 td}|| }|dk	rd| }|jdd	d
}	t||	 }|	|dtj
td}| ||}
|
S )zEStandard softmax self-attention, as in the original Transformer paperrI   r   z2Size mismatch between Q and K in softmax attentionr#   Nr
   r!   r   Tr   )r<   rM   r;   r   r.   r  r   Z
zeros_liker"   r   r   rh   alllogical_andri   ro   r   r   )r4   r   r   r	  r
  rS   rT   r   Zadditive_causal_maskZpadding_mask_allr   r7   r7   r8   r   n  s&    


z1MegaMovingAverageGatedAttention.softmax_attentionNF)r	  r
  r   c           #      C   s  |  \}}}	|	| jjkr2td| jj d|	 |}
| jjrH| |}| | |}| jjr|dk	r|dkr~td| |dd \}}}nd } }}| j	||||d\}}| 
|}| |}tj|| jj| jj| jj | jjgd	d
\}}}t|}t|}tj|| jj| jjgd	d
\}}|d| j | j }tj|dd
\}}|dd}|dd}|dd}| jjr|dk	rtj||gdd
}|dk	rtj||gdd
}| jjs|}|}n.| d| jj }|dkrd}d}n|}|}| d}| jjsD|d}|d}|d}|dk	r|d}n|| jjk r^|d}n$|| jj }|||| jj| jj}|| jjk r|d}|d}|dk	r|d}nX|| jj }|||| jj| jj}|||| jj| jj}|dk	r|||| jj}|dk	r.| dkr.d}| j||||d}| j|dd}|  |}t!||||| jjdd} | || "| |  } | 
| } t#|
|| |
 }!| jjs| |!}!|r|!|fn|!f}"| jjr|"|||f }"|"S )a  
        Mega's self-attention block, which combines multi-headed EMA with traditional self-attention

        Args:
            input (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
                Hidden states to be updated by Mega's self-attention
            padding_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked*
                or 0 for *masked*
            causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
                Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
                masked* or 0 for *masked*
            past_key_values (`tuple(torch.Tensor)`, *optional*):
                The hidden states returned from the previous timestep during incremental decoding; expects that
                self-attention key, value, and EMA states are the first 3 entries in the tuple
            output_attentions (`bool`, default `False`):
                Whether to return self-attention weights
            use_cache (`bool`, default `False`):
                Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
                states for use in the next step

        Returns:
            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
            inputs:
            - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
              states from target sequence updated by Mega's self-attention
            - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
              `(batch_size, 1, sequence_length, sequence_length)` -- The self-attention weights corresponding to how
              each token in the input sequence attends to every other token
            - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
              sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
              step of incremental decoding
            - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
              sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
              incremental decoding
            - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
              `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding.
        """
        seq_len, bsz, embed_dim = input.size()
        if embed_dim != self.config.hidden_size:
            raise ValueError(f"Input embedding dimension should be {self.config.hidden_size}; received {embed_dim}")

        # store inputs for the residual connection and apply pre-norm if requested
        residual = input
        if self.config.normalize_before_mega:
            input = self.norm(input)

        # (sequence_length X batch_size X intermediate_size)
        value = self.activation(self.v_proj(input))

        # unpack the incremental state if provided
        if self.config.is_decoder and (past_key_values is not None):
            if seq_len > 1:
                raise ValueError(f"Incremental decoding only supports self sequence length of 1; received {seq_len}")
            prev_self_key, prev_self_value, prev_ema_state = past_key_values[0:3]
        else:
            prev_self_key = prev_self_value = prev_ema_state = None

        # multi-dimensional damped EMA applied before single-headed attention
        ema_out, updated_ema_state = self.ema_gate(
            input, attention_mask=padding_mask, prev_state=prev_ema_state, use_cache=use_cache
        )
        ema_out = self.dropout(ema_out)

        # project the EMA output into residual weight, query/key gates, and intermediate state
        base = self.mx_proj(ema_out)
        residual_weight, query_key_gates, intermediate_state = torch.split(
            base,
            [
                self.config.hidden_size,
                self.config.shared_representation_size + self.config.intermediate_size,
                self.config.hidden_size,
            ],
            dim=-1,
        )
        residual_weight = torch.sigmoid(residual_weight)
        query_key_gates = F.silu(query_key_gates)
        query_key, attention_gate = torch.split(
            query_key_gates, [self.config.shared_representation_size, self.config.intermediate_size], dim=-1
        )

        # one shared projection turned into separate query and key via learned scale/offset
        query_key = query_key.unsqueeze(2) * self.qk_weight + self.qk_bias
        query, key = torch.unbind(query_key, dim=2)

        # (sequence_length X batch_size X dimension) -> (batch_size X sequence_length X dimension)
        query = query.transpose(0, 1)
        key = key.transpose(0, 1)
        value = value.transpose(0, 1)

        if self.config.is_decoder:
            if prev_self_key is not None:
                key = torch.cat([prev_self_key, key], dim=1)
            if prev_self_value is not None:
                value = torch.cat([prev_self_value, value], dim=1)

            if not self.config.use_chunking:
                updated_self_key, updated_self_value = key, value
            else:
                curr_len = key.size(1) % self.config.chunk_size
                updated_self_key = None if curr_len == 0 else key
                updated_self_value = None if curr_len == 0 else value

        ctx_len = key.size(1)
        if not self.config.use_chunking:
            # add a "number of chunks" dimension of 1 so attention shapes are uniform
            query = query.unsqueeze(1)
            key = key.unsqueeze(1)
            value = value.unsqueeze(1)
            if padding_mask is not None:
                padding_mask = padding_mask.unsqueeze(1)
        else:
            # reshape sequences into chunks of config.chunk_size
            if seq_len < self.config.chunk_size:
                query = query.unsqueeze(1)
            else:
                n_chunks = seq_len // self.config.chunk_size
                query = query.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size)

            if ctx_len < self.config.chunk_size:
                key = key.unsqueeze(1)
                value = value.unsqueeze(1)
                if padding_mask is not None:
                    padding_mask = padding_mask.unsqueeze(1)
            else:
                n_chunks = ctx_len // self.config.chunk_size
                key = key.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size)
                value = value.reshape(bsz, n_chunks, self.config.chunk_size, self.config.intermediate_size)
                if padding_mask is not None:
                    padding_mask = padding_mask.view(bsz, n_chunks, self.config.chunk_size)

        if padding_mask is not None and padding_mask.dim() == 0:
            padding_mask = None

        attn_weights = self.attention_function(query, key, padding_mask=padding_mask, causal_mask=causal_mask)
        value = self.hidden_dropout(value, batch_first=True)
        kernel = self.attention_dropout(attn_weights)

        # attention-weighted values back to (sequence_length X batch_size X intermediate_size)
        weighted_self_output = (
            torch.matmul(kernel, value).view(bsz, seq_len, self.config.intermediate_size).transpose(0, 1)
        )
        # (sequence_length X batch_size X hidden_size)
        weighted_self_output = self.activation(intermediate_state + self.h_proj(weighted_self_output * attention_gate))
        weighted_self_output = self.dropout(weighted_self_output)
        # gated residual connection
        out = torch.addcmul(residual, residual_weight, weighted_self_output - residual)

        if not self.config.normalize_before_mega:
            out = self.norm(out)

        return_values = (out, attn_weights) if output_attentions else (out,)

        if self.config.is_decoder:
            return_values = return_values + (updated_self_key, updated_self_value, updated_ema_state)

        return return_values


class MegaNormalizedFeedForwardNetwork(nn.Module):
    """
    Normalized feed-forward network used in Mega blocks. Left as-is from original Mega repo aside from retrieving args
    from Hugging Face config
    """

    def __init__(self, config: MegaConfig):
        super().__init__()
        self.config = config
        self.hidden_dim = config.nffn_hidden_size
        self.act_fn = config.activation
        self.activation = ACT2FN[config.activation]

        self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
        self.hidden_dropout = MegaDropout(
            self.config.nffn_activation_dropout_prob, is_featurewise=self.config.use_feature_dropout
        )

        self.prenorm = self.config.normalize_before_ffn
        self.norm = MegaSequenceNorm(self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine)

        self.fc1 = nn.Linear(self.config.hidden_size, self.config.nffn_hidden_size)
        self.fc2 = nn.Linear(self.config.nffn_hidden_size, self.config.hidden_size)

    def forward(self, inputs):
        residual = inputs

        if self.prenorm:
            inputs = self.norm(inputs)

        hidden = self.activation(self.fc1(inputs))
        hidden = self.hidden_dropout(hidden)
        output = self.fc2(hidden)
        output = self.dropout(output)
        output = output + residual

        if not self.prenorm:
            output = self.norm(output)

        return output


class MegaBlock(nn.Module):
    def __init__(self, config: MegaConfig):
        super().__init__()
        self.seq_len_dim = 1
        self.mega_layer = MegaMovingAverageGatedAttention(config)
        self.nffn = MegaNormalizedFeedForwardNetwork(config) if config.use_normalized_ffn else None
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.cross_attn = MegaGatedCrossAttention(config)
        else:
            self.cross_attn = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        causal_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[torch.FloatTensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor]:
        r"""
        A single Mega layer: either encoder or decoder, with optional cross-attention and optional normalized
        feed-forward layer

        Args:
            hidden_states (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
                Hidden states to be updated by the Mega block
            attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
                Indicates which entries in the self/target sequence are to be ignored (mostly due to padding), where
                elements are either 1 for *not masked* or 0 for *masked*. Causal attention is enforced internally.
            causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
                Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
                masked* or 0 for *masked*
            encoder_hidden_states (`torch.Tensor`, of shape `(source_sequence_length, batch_size, hidden_size)`, *optional*):
                Encoder hidden states to be used for cross-attention (and required for encoder-decoder model setup)
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
                Indicates which entries in the cross/source sequence are to be ignored (mostly due to padding), where
                elements are either 1 for *not masked* or 0 for *masked*.
            past_key_value (`tuple(torch.Tensor)`, *optional*):
                The hidden states returned from the previous timestep during incremental decoding; expects that
                self-attention key, value, and EMA states are the first 3 entries in the tuple, and (if doing
                cross-attention) cross-attention key and value are the last 2 entries in the tuple
            output_attentions (`bool`, default `False`):
                Whether to return self-attention weights
            use_cache (`bool`, default `False`):
                Whether to perform incremental decoding; uses `past_key_value` as prior state, and returns the updated
                states for use in the next step

        Returns:
            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
            inputs:
            - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
              Hidden states from target sequence updated by Mega
            - **self_attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
              `(batch_size, 1, target_sequence_length, target_sequence_length)` -- The self-attention weights
              corresponding to how each token in the input sequence attends to every other token
            - **cross_attn_weights** (*optional*, returned when `output_attentions=True` and
              `config.add_cross_attention=True`) `torch.FloatTensor` of shape `(batch_size, source_sequence_length,
              target_sequence_length)` -- Pairwise cross-attention weights between every entry in the source sequence
              and target sequence
            - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
              sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
              step of incremental decoding
            - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
              sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
              incremental decoding
            - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
              `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding.
            - **cross_key** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
              `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.shared_representation_size)` --
              The cross-attention key state for use in the next step of incremental decoding
            - **cross_value** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
              `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.hidden_size)` -- The
              cross-attention value state for use in the next step of incremental decoding
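
        Example (illustrative sketch; uses a freshly initialized, untrained block with the default encoder-only
        configuration, so only the shapes are meaningful):

        ```python
        >>> import torch

        >>> from transformers import MegaConfig
        >>> from transformers.models.mega.modeling_mega import MegaBlock

        >>> config = MegaConfig(hidden_size=32)
        >>> block = MegaBlock(config)
        >>> hidden_states = torch.randn(16, 2, config.hidden_size)  # (sequence_length, batch_size, hidden_size)
        >>> attention_mask = torch.ones(2, 16, dtype=torch.long)
        >>> updated_hidden_states = block(hidden_states, attention_mask=attention_mask)[0]
        >>> updated_hidden_states.shape
        torch.Size([16, 2, 32])
        ```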
        """
        # incremental decoding in the EMA module requires the attention mask to match the input sequence length;
        # when caching, assume the input is one step while the mask covers the full prefix
        if use_cache and (past_key_value is not None) and (attention_mask is not None):
            mega_padding_mask = attention_mask[:, -hidden_states.size(0) :]
        else:
            mega_padding_mask = attention_mask

        mega_outputs = self.mega_layer(
            input=hidden_states,
            padding_mask=mega_padding_mask,
            causal_mask=causal_mask,
            past_key_values=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )

        new_hidden_states = mega_outputs[0]
        self_key, self_value, self_ema_state = mega_outputs[-3:] if use_cache else (None, None, None)
        self_attention_weights = mega_outputs[1] if output_attentions else None

        # optional cross attention
        if self.cross_attn is not None:
            if encoder_hidden_states is None:
                raise ValueError("Requested cross-attention without providing encoder hidden states")

            cross_attn_outputs = self.cross_attn(
                query=new_hidden_states,
                key=encoder_hidden_states,
                value=encoder_hidden_states,
                key_padding_mask=encoder_attention_mask,
                past_key_values=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

            new_hidden_states = cross_attn_outputs[0]
            cross_key, cross_value = cross_attn_outputs[-2:] if use_cache else (None, None)
            cross_attention_weights = cross_attn_outputs[1] if output_attentions else None

        # optional normalized feed-forward network
        if self.nffn is not None:
            new_hidden_states = self.nffn(new_hidden_states)

        outs = (new_hidden_states,)
        if output_attentions:
            outs = outs + (self_attention_weights,)
            if self.cross_attn is not None:
                outs = outs + (cross_attention_weights,)

        if use_cache:
            new_key_values = (self_key, self_value, self_ema_state)
            if self.cross_attn is not None:
                new_key_values = new_key_values + (cross_key, cross_value)

            outs = outs + (new_key_values,)

        return outs


class MegaPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "pool" the model by taking the hidden state corresponding to the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class MegaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = MegaConfig
    base_model_prefix = "mega"
    supports_gradient_checkpointing = False
    _no_split_modules = ["MegaMovingAverageGatedAttention"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, MegaMultiDimensionDampedEma):
            with torch.no_grad():
                # delta & alpha
                nn.init.normal_(module.damping_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
                nn.init.normal_(module.decay_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
                # beta initialized around [1, -1, 1, -1, ...]
                val = torch.ones(self.config.ema_projection_size, 1)
                if self.config.ema_projection_size > 1:
                    idx = torch.tensor(list(range(1, self.config.ema_projection_size, 2)))
                    val.index_fill_(0, idx, -1.0)
                module.ema_expansion_matrix.normal_(mean=0.0, std=self.config.ema_beta_range).add_(val)
                # gamma & omega
                nn.init.normal_(module.kernel_projection_matrix, mean=0.0, std=self.config.ema_gamma_omega_range)
                nn.init.normal_(module.residual_weight, mean=0.0, std=self.config.ema_gamma_omega_range)
        elif isinstance(module, MegaSimpleRelativePositionalBias):
            nn.init.normal_(module.rel_pos_bias, mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, MegaRotaryRelativePositionalBias):
            nn.init.normal_(module.alpha, mean=0.0, std=self.config.initializer_range)
            nn.init.normal_(module.b_param, mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, MegaScaleNorm):
            if self.config.norm_affine:
                nn.init.constant_(module.scalar, 1.0)
        elif isinstance(module, MegaRMSNorm):
            if self.config.norm_affine:
                nn.init.constant_(module.weight, 1.0)
        elif isinstance(module, MegaMovingAverageGatedAttention):
            nn.init.normal_(module.qk_weight, mean=0.0, std=self.config.initializer_range)
            nn.init.constant_(module.qk_bias, 0.0)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


MEGA_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MegaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

MEGA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `add_token_type_embeddings` parameter
            set to `True`. All the value in this tensor should be always < config.type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare MEGA Model transformer outputting raw hidden-states without any specific head on top.",
    MEGA_START_DOCSTRING,
)
class MegaModel(MegaPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added after self-attention, following the architecture described in *Mega: Moving Average
    Equipped Gated Attention*_ by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig,
    Jonathan May, and Luke Zettlemoyer

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
    `True` and `bidirectional` set to `False`. To be used in a Seq2Seq model, the model needs to be initialized with both
    `is_decoder=True` and `bidirectional=False` argument as well as `add_cross_attention` set to `True`; an
    `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Mega: Moving Average Equipped Gated Attention*: https://arxiv.org/abs/2209.10655

    """

    def __init__(self, config: MegaConfig, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embedding_layer = MegaEmbeddings(config)
        self.layers = nn.ModuleList([MegaBlock(config) for _ in range(config.num_hidden_layers)])

        self.pooler = MegaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embedding_layer.word_embeddings

    def set_input_embeddings(self, value):
        self.embedding_layer.word_embeddings = value

    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   z5You have to specify either input_ids or inputs_embedsr   zconfig.use_chunking is activated; input sequence length must be shorter than or a multiple of config.chunk_size
received sequence length of z with chunk size r   r9   Fz;Received past key/value cache with size mismatch; expected z, received )r>   r    r?   r7   )r  r   r
  r  r  r  r   r   rI   )Zlast_hidden_stateZpooler_outputr   r  
attentionscross_attentions)r   r   rA  use_return_dictr;   Z%warn_if_padding_and_no_attention_maskr<   r:   r  r.   r(  rJ   r  r   r'  r1   Z*create_extended_attention_mask_for_decoderr   lenr5  r4  r   r  	enumerater6  r7  r   ) r4   r>   r   r    r?   r  r  r   r   r   rA  rB  r@   r:   Z
batch_sizeZsequence_lengthZtemp_mask_for_extensionr
  Zembedding_outputr  Zall_hidden_statesZall_self_attentionsZall_cross_attentionsZnext_decoder_cacheir  Zcurrent_decoder_cacher  Zself_attn_weightsZcross_attn_weightsZupdated_cacher#  r7   r7   r8   rA     s    )

  



zMegaModel.forward)T)NNNNNNNNNNN)rC   rD   rE   rF   r   r&   r;  r<  r   MEGA_INPUTS_DOCSTRINGrN   r   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r.   rL   r   rb   r   r   r   rA   rG   r7   r7   r5   r8   r1    sF              r1  zFMEGA Model with a `language modeling` head on top for CLM fine-tuning.c                       s   e Zd ZdgZed fddZdd Zdd Zee	
d	eeed
deej eej eej eej eej eej eej eeej  ee ee ee ee eeej ef dddZdddZdd Z  ZS )MegaForCausalLMzlm_head.weightr   c                    sx   t  | |jstd t|dd| _|jrNt	|j
|j
| _t | _nd | _d | _t	|j
|j| _|   d S )NzLIf you want to use `MegaForCausalLM` as a standalone, add `is_decoder=True.`Fr9  )r%   r&   r  loggerwarningr1  r%  add_lm_hidden_dense_layerr   r   r)   r   r!  hidden_activationr(   lm_headr8  r3   r5   r7   r8   r&   v  s    
zMegaForCausalLM.__init__c                 C   s   | j S rw   rR  r   r7   r7   r8   get_output_embeddings  s    z%MegaForCausalLM.get_output_embeddingsc                 C   s
   || _ d S rw   rS  r4   Znew_embeddingsr7   r7   r8   set_output_embeddings  s    z%MegaForCausalLM.set_output_embeddingsr=  )r@  r0  N)r>   r   r    r?   r  r  labelsr   r   r   rA  rB  r   c                 C   s  |dk	r|n| j j}|dk	r d}	| j||||||||	|
||d}|d }| jdk	rf| |}| |}| |}d}|dk	r|ddddddf  }|ddddf  }t }||d| j j	|d}|s|f|dd  }|dk	r|f| S |S t
|||j|j|j|jdS )	a
  
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MegaForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
        >>> config = AutoConfig.from_pretrained("mnaylor/mega-base-wikitext")
        >>> config.is_decoder = True
        >>> config.bidirectional = False
        >>> model = MegaForCausalLM.from_pretrained(
        ...     "mnaylor/mega-base-wikitext", config=config, ignore_mismatched_sizes=True
        ... )

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.mega(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        if self.dense is not None:
            sequence_output = self.dense(sequence_output)
            sequence_output = self.hidden_activation(sequence_output)

        prediction_scores = self.lm_head(sequence_output)

        lm_loss = None
        if labels is not None:
            # next-token prediction: shift predictions and labels by one position
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        # if the model is used as a decoder in an encoder-decoder setup, create the attention mask on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # cut decoder_input_ids if a cache is used
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING)
class MegaForMaskedLM(MegaPreTrainedModel):
    _tied_weights_keys = ["mlm_head.weight"]

    def __init__(self, config: MegaConfig):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `MegaForMaskedLM`, set `config.is_decoder=False` for bi-directional self-attention."
            )

        self.mega = MegaModel(config, add_pooling_layer=False)
        if config.add_lm_hidden_dense_layer:
            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
            self.hidden_activation = nn.Tanh()
        else:
            self.dense = None
            self.hidden_activation = None
        self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
        self.dropout = nn.Dropout(config.dropout_prob)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_head

    def set_output_embeddings(self, new_embeddings):
        self.mlm_head = new_embeddings

    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="<mask>",
        expected_output="' Paris'",
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.
        N)r   r    r?   r  r  r   rA  rB  r   r#   rI   rX  rY  r  rC  )r   rE  r%  r   rQ  rg  r   rR   r(   r   r  rC  )r4   r>   r   r    r?   r  r  rW  r   rA  rB  r   r[  r\  Zmasked_lm_lossr]  r   r7   r7   r8   rA   3  s<    



zMegaForMaskedLM.forward)
NNNNNNNNNN)rC   rD   rE   re  r   r&   rT  rV  r   rI  rN   r   rJ  r   rK  r   r.   r  rb   r   r   r   rL   rA   rG   r7   r7   r5   r8   rf    sH   
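
# Illustrative usage sketch (not part of the original module): filling a `<mask>` token with
# `MegaForMaskedLM`, using the documented checkpoint. The prompt sentence and the use of
# `AutoTokenizer` are assumptions for the example only.
def _example_mega_masked_lm_fill_mask():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
    model = MegaForMaskedLM.from_pretrained("mnaylor/mega-base-wikitext")

    inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # pick the highest-scoring vocabulary entry at the masked position
    mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    predicted_id = logits[0, mask_index].argmax(dim=-1)
    return tokenizer.decode(predicted_id)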

@add_start_docstrings(
    """
    MEGA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    MEGA_START_DOCSTRING,
)
class MegaForSequenceClassification(MegaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.mega = MegaModel(config, add_pooling_layer=False)
        self.classifier = MegaClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r    r?   r   rA  rB  r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr#   rI   rk  )r   rE  r%  rp  Zproblem_typern  r"   r.   r1   ru   r	   r   r   rR   r   r   r  rC  r4   r>   r   r    r?   rW  r   rA  rB  r   r[  rY  rX  r]  r   r7   r7   r8   rA     sP    	



"


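
# Illustrative usage sketch (not part of the original module): single-label sequence classification
# with the head above. The sentence, label value, and two-class setup are assumptions for the example.
def _example_mega_sequence_classification():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
    model = MegaForSequenceClassification.from_pretrained("mnaylor/mega-base-wikitext", num_labels=2)

    inputs = tokenizer("This movie was great!", return_tensors="pt")
    labels = torch.tensor([1])  # class index, so the problem type resolves to single-label classification
    outputs = model(**inputs, labels=labels)
    # `outputs.loss` is a cross-entropy loss over `num_labels` classes
    return outputs.loss, outputs.logits.argmax(dim=-1)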

@add_start_docstrings(
    """
    MEGA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    MEGA_START_DOCSTRING,
)
class MegaForMultipleChoice(MegaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.mega = MegaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # flatten (batch_size, num_choices, ...) inputs into (batch_size * num_choices, ...)
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.mega(
            flat_input_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # one score per choice, compared against the index of the correct choice
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@add_start_docstrings(
    """
    MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    MEGA_START_DOCSTRING,
)
class MegaForTokenClassification(MegaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mega = MegaModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mega(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
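
# Illustrative usage sketch (not part of the original module): per-token tagging with the head above.
# The sentence and the two-label tag set are assumptions for the example only.
def _example_mega_token_classification():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
    model = MegaForTokenClassification.from_pretrained("mnaylor/mega-base-wikitext", num_labels=2)

    inputs = tokenizer("Mega was trained on Wikitext", return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # one predicted label id per input token
    predicted_token_classes = logits.argmax(dim=-1)
    return predicted_token_classes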

class MegaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        # take the hidden state of the first token (equivalent to [CLS]) as the sentence representation
        x = features[:, 0, :]
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x




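# Illustrative usage sketch (not part of the original module): the classification head pools the
# first-token hidden state, so it can be exercised on a dummy encoder output of shape
# (batch_size, sequence_length, hidden_size). The config values below are assumptions.
def _example_mega_classification_head():
    config = MegaConfig(hidden_size=32, num_labels=3)
    head = MegaClassificationHead(config)

    dummy_encoder_output = torch.randn(2, 10, config.hidden_size)
    logits = head(dummy_encoder_output)  # shape: (2, 3)
    return logits.shape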
zMegaClassificationHead.forwardr   r7   r7   r5   r8   ro  s  s   	ro  z
    MEGA Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    MEGA_START_DOCSTRING,
)
class MegaForQuestionAnswering(MegaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mega = MegaModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mega(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # if we are on multi-GPU, split adds an extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # positions outside the sequence are clamped to `ignored_index` and excluded from the loss
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
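
# Illustrative usage sketch (not part of the original module): extractive question answering with the
# head above, decoding the most likely answer span. The question/context pair is an assumption for
# the example only.
def _example_mega_question_answering():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
    model = MegaForQuestionAnswering.from_pretrained("mnaylor/mega-base-wikitext")

    question = "What is MEGA trained on?"
    context = "MEGA is a model trained on the Wikitext dataset."
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # the answer span runs from the best start index to the best end index (inclusive)
    start_index = int(outputs.start_logits.argmax())
    end_index = int(outputs.end_logits.argmax())
    answer_ids = inputs.input_ids[0, start_index : end_index + 1]
    return tokenizer.decode(answer_ids)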