"""PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19"""

import math
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss, LayerNorm

from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_fsmt import FSMTConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "facebook/wmt19-ru-en"
_CONFIG_FOR_DOC = "FSMTConfig"

FSMT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`FSMTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
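
            A minimal sketch of the two initialization paths described above (the checkpoint name is only
            illustrative):

            ```python
            >>> from transformers import FSMTConfig, FSMTForConditionalGeneration

            >>> config = FSMTConfig.from_pretrained("facebook/wmt19-ru-en")
            >>> model = FSMTForConditionalGeneration(config)  # architecture only, randomly initialized weights
            >>> model = FSMTForConditionalGeneration.from_pretrained("facebook/wmt19-ru-en")  # loads the weights
            ```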

"""

FSMT_GENERATION_EXAMPLE = r"""
    Translation example::

    ```python
    >>> from transformers import AutoTokenizer, FSMTForConditionalGeneration

    >>> mname = "facebook/wmt19-ru-en"
    >>> model = FSMTForConditionalGeneration.from_pretrained(mname)
    >>> tokenizer = AutoTokenizer.from_pretrained(mname)

    >>> src_text = "Машинное обучение - это здорово, не так ли?"
    >>> input_ids = tokenizer(src_text, return_tensors="pt").input_ids
    >>> outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3)
    >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
    "Machine learning is great, isn't it?"
    ```
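
    A batched variant (a sketch under the same setup as above; the second source sentence is illustrative only):

    ```python
    >>> src_texts = [src_text, "Это тестовое предложение."]
    >>> batch = tokenizer(src_texts, return_tensors="pt", padding=True)
    >>> outputs = model.generate(**batch, num_beams=5)
    >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
    ```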

"""

FSMT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`FSMTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            FSMT uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`Tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden-states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`Tuple(torch.FloatTensor)` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


def invert_mask(attention_mask):
    """Turns 1->0, 0->1, False->True, True->False"""
    assert attention_mask.dim() == 2
    return attention_mask.eq(0)


def triu_onnx(x, diagonal=0):
    l = x.shape[0]
    arange = torch.arange(l, device=x.device)
    mask = arange.expand(l, l)
    arange = arange.unsqueeze(-1)
    if diagonal:
        arange = arange + diagonal
    mask = mask >= arange
    return x.masked_fill(mask == 0, 0)


def _prepare_fsmt_decoder_inputs(
    config,
    input_ids,
    decoder_input_ids=None,
    decoder_padding_mask=None,
    causal_mask_dtype=torch.float32,
):
    """
    Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
    This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during
    generation
    """
    pad_token_id = config.pad_token_id
    if decoder_input_ids is None:
        decoder_input_ids = shift_tokens_right(input_ids, pad_token_id)
    bsz, tgt_len = decoder_input_ids.size()
    if decoder_padding_mask is None:
        decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id)
    else:
        decoder_padding_mask = invert_mask(decoder_padding_mask)
    causal_mask = triu_onnx(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len)), 1).to(
        dtype=causal_mask_dtype, device=decoder_input_ids.device
    )
    return decoder_input_ids, decoder_padding_mask, causal_mask


class PretrainedFSMTModel(PreTrainedModel):
    config_class = FSMTConfig
    base_model_prefix = "model"

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, SinusoidalPositionalEmbedding):
            pass
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs


def _make_linear_from_emb(emb):
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer


def _check_shapes(shape_1, shape2):
    if shape_1 != shape2:
        raise AssertionError(f"shape mismatch: {shape_1} != {shape2}")


def shift_tokens_right(input_ids, pad_token_id):
    """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>)."""

    # replace possible -100 values in labels by `pad_token_id`
    input_ids.masked_fill_(input_ids == -100, pad_token_id)

    prev_output_tokens = input_ids.clone()
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens


def make_padding_mask(input_ids, padding_idx=1):
    """True for pad tokens"""
    padding_mask = input_ids.eq(padding_idx)
    if not padding_mask.any():
        padding_mask = None
    return padding_mask
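
# Example (illustrative values, assuming pad_token_id=1 and <eos>=2): `shift_tokens_right` wraps the final
# non-pad token (the <eos>) around to position 0 and shifts everything else one position to the right:
#
#   >>> shift_tokens_right(torch.tensor([[5, 6, 7, 2, 1, 1]]), pad_token_id=1)
#   tensor([[2, 5, 6, 7, 2, 1]])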


class EncoderLayer(nn.Module):
    def __init__(self, config: FSMTConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = Attention(self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)

    def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=False):
        """
        Args:
            x (`torch.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
            encoder_padding_mask (`torch.ByteTensor`): binary ByteTensor of shape
                *(batch, src_len)* where padding elements are indicated by `1`.
            for t_tgt, t_src is excluded (or masked out), =0 means it is
            included in attention
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                *(config.encoder_attention_heads,)*.

        Returns:
            encoded output of shape *(seq_len, batch, embed_dim)*
        """
        residual = x
        x, attn_weights = self.self_attn(
            query=x,
            key=x,
            key_padding_mask=encoder_padding_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        residual = x
        x = self.activation_fn(self.fc1(x))
        x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = nn.functional.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.final_layer_norm(x)
        return x, attn_weights
Z
  ZS )FSMTEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`EncoderLayer`].

    Args:
        config: FSMTConfig
    rj   c                    s   t     j| _ j| _|j| _|| _|j} jr>t	
|nd| _t j| j d || j| _t fddt jD | _d S )N      ?r   c                    s   g | ]}t  qS r!   )ri   .0_rj   r!   r"   
<listcomp>  s     z(FSMTEncoder.__init__.<locals>.<listcomp>)rl   rm   rk   Zencoder_layerdrop	layerdroprN   embed_tokensembedding_dimscale_embeddingmathsqrtembed_scalerL   max_position_embeddingsembed_positionsr	   
ModuleListrangeZencoder_layerslayers)rO   r:   r   ro   r{   rj   r"   rm     s    
  zFSMTEncoder.__init__NFTr;   r    inputs_embeds	head_maskr   output_hidden_statesreturn_dictc                 C   s  |dk	rt |}|dk	r*|dk	r*tdn|dk	rN| || j }| |}n`|dk	r|| j }|dddddf |dddddf d| jj}	| |	}ntd|| }
tj	j
|
| j
| jd}
|
dd}
|rdnd}|rdnd}|dk	r6| d t| jks6tdt| j d	| d  d
t| jD ]\}}|rp|
dd}
||
f7 }|
dd}
tg }| jr|| jk rd}n&||
||dk	r|| nd|d\}
}|r@||f }q@|
dd}
|r||
f7 }|stdd |
||fD S t|
||dS )a  
        Args:
            input_ids (`torch.LongTensor`): tokens in the source language of shape
                *(batch, src_len)*
            attention_mask (`torch.LongTensor`): indicating which indices are padding tokens
            inputs_embeds (`torch.FloatTensor`):
                embedding vectors of shape *(batch, src_len, embed_dim)*
            head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        Returns:
            BaseModelOutput or Tuple comprised of:

                - **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
                - **encoder_states** (`Tuple(torch.FloatTensor`)): all intermediate hidden states of shape *(src_len,
                  batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
                - **all_attentions** (`Tuple(torch.FloatTensor`)): Attention weights for each layer.
                During training might not be of length n_layers because of layer dropout.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   r   r!   z&The head_mask should be specified for  layers, but it is for .)r   r   c                 s   s   | ]}|d k	r|V  qd S Nr!   r   vr!   r!   r"   	<genexpr>=  s      z&FSMTEncoder.forward.<locals>.<genexpr>last_hidden_statehidden_states
attentions)r#   
ValueErrorr   r   r   r,   r   rN   r	   r   rk   r   	transposer5   lenr   r   	enumerater(   randr   tupler   )rO   r;   r    r   r   r   r   r   Z	embed_posposition_idsr-   Zencoder_statesZall_attentionsidxZencoder_layerdropout_probabilityZattnr!   r!   r"   r     s`    !

 





zFSMTEncoder.forward)NNNFFT)rZ   r[   r\   __doc__r   rm   r(   r   r   boolr   r   r!   r!   r{   r"   r     s"         r   c                       s,   e Zd Zed fddZdddZ  ZS )	DecoderLayerrj   c                    s   t    |j| _t| j|j|jd| _|j| _t	|j
 | _|j| _t| j| _t| j|j|jdd| _t| j| _t| j|j| _t|j| j| _t| j| _d S )N)ro   	num_headsrk   T)rk   encoder_decoder_attention)rl   rm   rn   ro   rp   Zdecoder_attention_headsrq   rr   rk   r   rt   ru   rv   r   rs   encoder_attnencoder_attn_layer_normr	   rH   Zdecoder_ffn_dimrw   rx   ry   rz   r{   r!   r"   rm   B  s*    
zDecoderLayer.__init__NFc
              	   C   s  |}
|d kri }| j |||||||	d\}}tjj|| j| jd}|
| }| |}|}
| jj| j jkslt| j||||||	d\}}tjj|| j| jd}|
| }| 	|}|}
| 
| |}tjj|| j| jd}| |}tjj|| j| jd}|
| }| |}||||fS )N)r}   r~   layer_stater   	attn_maskr   r   r   )r}   r~   r   r   r   r   )rr   r	   r   rk   r   rs   r   	cache_keyr   r   ru   rw   rv   rx   ry   )rO   r-   encoder_hidden_statesencoder_attn_maskr   rA   r   cross_attn_layer_head_maskr=   r   r   Zself_attn_weightsZcross_attn_weightsr!   r!   r"   r   [  sP    
	




zDecoderLayer.forward)NNNNNNFr   r!   r!   r{   r"   r   A  s          r   c                       sx   e Zd ZdZeejd fddZdej	ej	ej	ej	ej	e
ej	 e
ej	 e
ej	 e
eej  eeeedd	d
Z  ZS )FSMTDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DecoderLayer`]

    Args:
        config: FSMTConfig
        embed_tokens (nn.Embedding): output embedding
    )r:   r   c              	      s   t     j| _ j| _|j| _ jr4t j	nd| _
|| _|j}t j| j d || j| _t fddt jD | _t rdd l}|jj| jjd d | jjj}W 5 Q R X n
| jjj}tj|d |d dd| _| jj| j_d S )	Nr   r   c                    s   g | ]}t  qS r!   )r   r   rj   r!   r"   r     s     z(FSMTDecoder.__init__.<locals>.<listcomp>r   )Zmodifier_rankFr`   )rl   rm   rk   Zdecoder_layerdropr   rN   r   r   r   rn   r   r   r   rL   r   r   r	   r   r   Zdecoder_layersr   r   	deepspeedzeroZGatheredParametersrI   r'   rH   output_projection)rO   r:   r   ro   r   Zembed_tokens_weight_shaper{   rj   r"   rm     s,    
  
zFSMTDecoder.__init__NFT)r;   r   r   r=   decoder_causal_maskr   r   cross_attn_head_maskpast_key_values	use_cacher   r   r   c                 C   s  |dk	rt |}|dk	r*|dk	r*tdn|dk	rz| |}|
rh|ddddf }|ddddf }| || j }n`|dk	r|dddddf |dddddf d| jj}| |}|| j }ntd||7 }tj	j
|| j
| jd}|dd}|dd}|rdnd}|r(dnd}|r6dnd}g }t||gd	d
gD ]V\}}|dk	rP| d t| jksPtd| dt| j d| d  dqPt| jD ]\}}|r|dd}||f7 }|dd}| jrtg }|| jk rq|	dk	r|	| nd}||||||||dk	r:|| nd|dk	rN|| nd|d	\}}}}|
rt||  |r||f7 }||f7 }q|r|dd}||f7 }|dd}|dd}|dd}| |}|
r|nd}|stdd |||||fD S t|||||dS )a  
        Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
        EMNLP 2019).

        Args:
            input_ids (`torch.LongTensor` of shape `(batch, tgt_len)`):
                previous decoder outputs for teacher forcing
            encoder_hidden_states: output from the encoder, used for
                encoder-side attention
            encoder_padding_mask: for ignoring pad tokens
            past_key_values (dict or None): dictionary used for storing state during generation
            head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        Returns:
            BaseModelOutputWithPast or tuple:

                - the decoder's features of shape *(batch, tgt_len, embed_dim)*
                - the cache
                - hidden states
                - attentions
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer&   r   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r   r!   r   r   zThe `z` should be specified for r   r   )r   r=   r   rA   r   r   r   c                 s   s   | ]}|d k	r|V  qd S r   r!   r   r!   r!   r"   r   E  s     z&FSMTDecoder.forward.<locals>.<genexpr>)r   r   r   r   cross_attentions)r#   r   r   r   r   r,   r   rN   r	   r   rk   r   r   zipr5   r   r   r   r   r(   r   r   appendcopyr   r   r   )rO   r;   r   r   r=   r   r   r   r   r   r   r   r   r   	positionsr-   r   Zall_hidden_statesZall_self_attnsZall_cross_attnsZnext_decoder_cacher   Z	mask_namer   Zdecoder_layerr   r   Zlayer_self_attn
layer_pastZlayer_cross_attnZ
next_cacher!   r!   r"   r     s    0

 

$




zFSMTDecoder.forward)NNNNFFFT)rZ   r[   r\   r   r   r	   rM   rm   r(   r   r   r   FloatTensorr   r   r   r!   r!   r{   r"   r     s2            r   c                 C   s.   |   D ] \}}|d k	r|d|| |< q| S )Nr   )itemsZindex_select)
attn_cacheZ	new_orderkZinput_buffer_kr!   r!   r"   _reorder_bufferQ  s    r   c                
       sx   e Zd ZdZd fdd	Zdd Zdee ee eee	ee f  ee ee e
eee f d
ddZdd Z  ZS )rp   z=Multi-headed attention from 'Attention Is All You Need' paperrE   TFc                    s   t    || _|| _|| _|| | _| j| | jks>td| jd | _|| _t	j
|||d| _t	j
|||d| _t	j
|||d| _t	j
|||d| _| jrdnd| _d S )Nz(embed_dim must be divisible by num_headsg      r`   Zencoder_decoderrO   )rl   rm   ro   r   rk   head_dimr   scalingr   r	   rH   k_projv_projq_projout_projr   )rO   ro   r   rk   rK   r   r{   r!   r"   rm   [  s    

zAttention.__init__c                 C   s"   |  ||| j | jddS )Nr   r   )
contiguousviewr   r   r   )rO   rW   seq_lenr?   r!   r!   r"   _shaper  s    zAttention._shapeN)r~   r   r   r   r   returnc                 C   s  | j }| \}	}
}|| jks"tt| |	|
|gks<t|dk	rd|| ji }d|krl|rld}nd}i }| || j }|r|dkrd }}q| 	|}| 
|}n| 	|}| 
|}| ||	|
}|dk	r| |d|
}|dk	r| |d|
}|dk	r| ||||||
\}}}||
| jd| j||
| jd| j|sH|ndd|| j< |dk	sdt|d}t||dd}| |
| j |	|fkst|dk	r||
| j|	|| }||
| j |	|}|dk	r| dkrd}|dks| dd |
|fkst|dk	rn||
| j|	|}|dd}||t|jj}||
| j |	|}tjj|dd}|dk	r| | jfkstd	| jf d
|  |dddd||
| j|	| }||
| j |	|}|r||
| j|	|}||
| j |	|}nd}tjj|| j| jd}|dk	sBtt||}| |
| j |	| jfksnt|dd |	|
|}| |}||fS )z+Input shape: Time(SeqLen) x Batch x ChannelNprev_keyr&   )r   
prev_valueprev_key_padding_maskr   r   r   rd   z/Head mask for a single layer should be of size z	, but is r   ) r   r5   ro   r   listgetr   r   r   r   r   r   _use_saved_stater   r   r   r(   Zbmmr   r   r+   r,   finfor2   minr	   r   Zsoftmaxrk   r   r   r   )rO   r}   r~   r   r   r   r   r   	static_kvr@   r?   ro   saved_stateqr   r   Zsrc_lenr   ZreshapedZattn_weights_reshapedZ
attn_probsZattn_outputr!   r!   r"   r   u  s    









" 
zAttention.forwardc                 C   s  d|krZ|d }|d k	st ||| j d| j}|r<|}n|d k	sHt tj||gdd}d|kr|d }	|	d k	svt |	|| j d| j}
|r|
}n|d k	st tj|
|gdd}|d k	r|d k	st |dd }|d k	r|r|}qtj||gdd}n|}|||fS )Nr   r&   r   rd   r   r   )r   r   r   r   r(   catr   )rO   r   r   r   r   r   r?   Z	_prev_keyr   Z_prev_valuer   r   Znew_key_padding_maskr!   r!   r"   r     s0    zAttention._use_saved_state)rE   TF)NNNNF)rZ   r[   r\   r   rm   r   r   r   r   strr   r   r   r   r!   r!   r{   r"   rp   X  s(           irp   c                 C   s   |   t| jj| S )z:FP16-compatible function that fills a input_ids with -inf.)floatZfill_r(   r   r2   r   type_astr!   r!   r"   r7     s    r7   c                 C   s   t | dd S )Nr'   )getattrr   r!   r!   r"   
_get_shape  s    r   zRThe bare FSMT Model outputting raw hidden-states without any specific head on top.c                       s
  e Zd ZddgZed fddZdd Zdd	 Zd
d Ze	e
eeeeddejeej eej eej eej eej eej eeej  eeej  ee ee ee eej eej ee eeej ef dddZdd Zdd Zdd Zdd Z  ZS )	FSMTModeldecoder.embed_tokens.weight decoder.output_projection.weightrj   c                    sZ   t  | |j}t|j|j|}t|j|j|}t||| _	t
||| _|   d S r   )rl   rm   r3   r	   rM   Zsrc_vocab_sizern   tgt_vocab_sizer   encoderr   decoder	post_init)rO   r:   rN   Zencoder_embed_tokensZdecoder_embed_tokensr{   r!   r"   rm     s    zFSMTModel.__init__c                 C   s   | j S r   )r   rO   r!   r!   r"   get_encoder  s    zFSMTModel.get_encoderc                 C   s   | j S r   )r   r   r!   r!   r"   get_decoder  s    zFSMTModel.get_decoderc                 C   s4   | j jr0| | jj|   | | jj|   d S r   )r:   Ztie_word_embeddingsZ_tie_or_clone_weightsr   r   get_input_embeddingsr   r   r!   r!   r"   _tie_weights"  s    zFSMTModel._tie_weights)
checkpointoutput_typer]   N)r;   r    r<   decoder_attention_maskr   decoder_head_maskr   encoder_outputsr   r   r   r   r   decoder_inputs_embedsr   r   c                 C   sz  |d krd}
|d k	r|n| j j}|d k	r,|n| j j}|
d k	r@|
n| j j}
|d k	rT|n| j j}|
s|d k	rt| j |||| jjjj	d\}}}nd\}}|d kr|d krt
d|d kr| j|||||||d}nN|rt|tst|d t|dkr|d nd t|dkr|d nd d	}| j||d |||||||	|
|||d
}|sR|| S t|j|j|j|j|j|j|j|jdS )NF)r<   r=   r>   )NNzIMake sure that `decoder_input_ids` or `decoder_inputs_embeds` are passed.r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentions)r:   r   r   r   use_return_dictrB   r   r   rI   r2   r   r   rG   r   r   r   r   r   r   r   r   )rO   r;   r    r<   r  r   r  r   r  r   r   r   r   r   r  r   r=   rA   Zdecoder_outputsr!   r!   r"   r   '  sz    

zFSMTModel.forwardc                 C   s   | j jS r   r   r   r   r!   r!   r"   r     s    zFSMTModel.get_input_embeddingsc                 C   s   || j _d S r   r  rO   valuer!   r!   r"   set_input_embeddings  s    zFSMTModel.set_input_embeddingsc                 C   s   | j jS r   r   r   r   r!   r!   r"   get_output_embeddings  s    zFSMTModel.get_output_embeddingsc                 C   s   || j _d S r   r  r  r!   r!   r"   set_output_embeddings  s    zFSMTModel.set_output_embeddings)NNNNNNNNNNNNNN)rZ   r[   r\   _tied_weights_keysr   rm   r   r   r   r   FSMT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr(   
LongTensorr   r   
BoolTensorr   r   r   r   r   r   r  r  r  r   r!   r!   r{   r"   r     s^                 \r   zLThe FSMT Model with a language modeling head. Can be used for summarization.c                       s*  e Zd ZdZddgZed fddZeee	e
edeedejeej eej eej eej eej eej eeej  eeej  eej eej eej ee ee ee ee eeej e
f d	d
dZdddZejdddZedd Zdd Zdd Zdd Zdd Z  Z S )FSMTForConditionalGenerationrD   r   r   rj   c                    s&   t  | t|}|| _|   d S r   )rl   rm   r   rD   r   )rO   r:   Z
base_modelr{   r!   r"   rm     s    z%FSMTForConditionalGeneration.__init__)r  r]   N)r;   r    r<   r  r   r  r   r  r   r   r  labelsr   r   r   r   r   c                 C   s   |dk	r|n| j j}|dk	r d}| j||
|||||||||	||||d}|d }d}|dk	r~t }||d| j j|d}|s|f|dd  }|dk	r|f| S |S t|||j|j|j	|j
|j|j|jd	S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        NF)r   r    r<   r  r  r  r   r  r   r   r   r   r   r   r   r&   r   )	ZlossZlogitsr   r  r  r   r  r   r	  )r:   r
  rD   r
   r   r   r   r   r  r  r   r  r   r	  )rO   r;   r    r<   r  r   r  r   r  r   r   r  r  r   r   r   r   outputsZ	lm_logitsZmasked_lm_lossZloss_fctoutputr!   r!   r"   r     sN    z$FSMTForConditionalGeneration.forwardc	           
   
   K   s   d ||||||||d	S )N)	r;   r  r   r<   r    r   r  r   r   r!   )
rO   r<   r   r    r   r  r   r   r  kwargsr!   r!   r"   prepare_inputs_for_generation  s    z:FSMTForConditionalGeneration.prepare_inputs_for_generation)r  c                 C   s   t || jjS r   )r4   r:   r3   )rO   r  r!   r!   r"   %prepare_decoder_input_ids_from_labels
  s    zBFSMTForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    s2   g }| D ]$} fdd|  D }|| q|S )Nc                    s   i | ]\}}|t | qS r!   )r   )r   Zattn_keyr   beam_idxr!   r"   
<dictcomp>  s     z?FSMTForConditionalGeneration._reorder_cache.<locals>.<dictcomp>)r   r   )r   r   Zreordered_pastr   Zlayer_past_newr!   r  r"   _reorder_cache  s    
z+FSMTForConditionalGeneration._reorder_cachec                 C   s   | j jS r   )rD   r   r   r!   r!   r"   r     s    z(FSMTForConditionalGeneration.get_encoderc                 C   s   | j jS r   )rD   r   r   r!   r!   r"   r     s    z(FSMTForConditionalGeneration.get_decoderc                 C   s
   | j jjS r   rD   r   r   r   r!   r!   r"   r    s    z2FSMTForConditionalGeneration.get_output_embeddingsc                 C   s   || j j_d S r   r#  r  r!   r!   r"   r  !  s    z2FSMTForConditionalGeneration.set_output_embeddings)NNNNNNNNNNNNNNN)NNNNNNN)!rZ   r[   r\   r^   r  r   rm   r   r  r   r   r  r   FSMT_GENERATION_EXAMPLEr(   r  r   r   r  r   r   r   r   r   r  r  staticmethodr"  r   r   r  r  r   r!   r!   r{   r"   r    sn   
               M       


r  c                       sd   e Zd ZdZdd Z fddZedd Zeedd	d
Z	de
e e
e d fddZ  ZS )rL   a<  
    This module produces sinusoidal positional embeddings of any length.

    We don't want to save the weight of this embedding since it's not trained (deterministic) and it can be huge.

    Padding symbols are ignored.

    These embeddings get automatically extended in forward if more positions is needed.
    c                 C   s   |  ||| d S r   )make_weight)rO   num_positionsr   rN   r!   r!   r"   rm   0  s    z&SinusoidalPositionalEmbedding.__init__c                    sf   |  |||}t| ds.t j||||d n"|j| jj| jjd}t	|| _| j
  d| j_d S )NrI   )Z_weight)r2   r%   F)get_embeddinghasattrrl   rm   r9   rI   r2   r%   r	   	ParameterZdetach_Zrequires_grad)rO   r'  r   rN   rI   r{   r!   r"   r&  3  s    

z)SinusoidalPositionalEmbedding.make_weightc                 C   s   |d }t d|d  }ttj|tjd|  }tj| tjdd|d }tjt|t	|gdd
| d}|d dkrtj|t| dgdd}|dk	rd||ddf< |S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r   i'  r   r1   r   rd   r&   N)r   logr(   expr)   r   r+   r   sincosr   r8   )Znum_embeddingsr   rN   Zhalf_dimra   r!   r!   r"   r(  ?  s     &z+SinusoidalPositionalEmbedding.get_embedding)rN   c                 C   s.   |  | }tj|dd||  | S )z
        Replace non-padding symbols with their position numbers.

        Position numbers begin at padding_idx+1. Padding symbols are ignored.
        r   rd   )rX   intr(   Zcumsumr   long)rW   rN   r/   r!   r!   r"   make_positionsS  s    z,SinusoidalPositionalEmbedding.make_positionsN)incremental_statetimestepc                    s\   |j dd \}}| jd | }|| jdkrB| || j| j | || j}t |S )z/Input is expected to be of size [bsz x seqlen].Nr   r   r   )	r'   rN   rI   r5   r&  r   r1  rl   r   )rO   inputr2  r3  r?   r   Zmax_posr   r{   r!   r"   r   a  s    z%SinusoidalPositionalEmbedding.forward)NN)rZ   r[   r\   r   rm   r&  r%  r(  r/  r1  r   r   r   r   r   r!   r!   r{   r"   rL   %  s   

  rL   )r   )r   )Ar   r   typingr   r   r   r   r   r   r(   r   r	   Ztorch.nnr
   r   Zactivationsr   Zintegrations.deepspeedr   Zmodeling_outputsr   r   r   r   Zmodeling_utilsr   utilsr   r   r   r   r   r   Zconfiguration_fsmtr   Z
get_loggerrZ   loggerr  r  ZFSMT_START_DOCSTRINGr$  r  r#   r0   Zfloat32rB   rC   rb   rc   r4   r6   Moduleri   r   r   r   r   rp   r7   r   r   r  rM   rL   r!   r!   r!   r"   <module>   sf     
yS


1yX 9 '   