""" PyTorch UMT5 model."""

import copy
import math
from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.utils.checkpoint import checkpoint

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_torch_fx_proxy,
    logging,
    replace_return_docstrings,
)
from .configuration_umt5 import UMT5Config


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "UMT5Config"
_CHECKPOINT_FOR_DOC = "google/umt5-small"


# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->UMT5
class UMT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # UMT5 uses a layer norm which only scales and doesn't shift, also known as Root Mean Square Layer
        # Normalization (https://arxiv.org/abs/1910.07467): the variance is computed without mean subtraction
        # and there is no bias. Half-precision inputs are normalized in fp32 for numerical stability.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert back into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->UMT5
class UMT5DenseActDense(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states


# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->UMT5
class UMT5DenseGatedActDense(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # To keep 8bit quantization working, `self.wo` may be kept in float32; cast activations to its dtype
        # unless it is quantized to int8.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


# Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->UMT5
class UMT5LayerFF(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = UMT5DenseGatedActDense(config)
        else:
            self.DenseReluDense = UMT5DenseActDense(config)

        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class UMT5Attention(nn.Module):
    """
    T5's attention using relative_attention_bias.
    """

    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()

    def _shape(self, projection: torch.Tensor) -> torch.Tensor:
        new_projection_shape = projection.size()[:-1] + (self.n_heads, self.key_value_proj_dim)
        # move heads to 2nd position: (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
        return new_projection

    def _relative_position_bucket(self, relative_position):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
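
        For example (worked out from the bucketing rule below): with the defaults `num_buckets=32` and
        `max_distance=128` in a bidirectional (encoder) layer, 16 buckets serve each direction, so a relative
        position of -3 lands in bucket 3, +3 lands in bucket 19 (3 + 16), and any relative position at or beyond
        the max distance is clamped into the last bucket of its direction.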
        """
        relative_buckets = 0
        num_buckets = self.relative_attention_num_buckets
        max_distance = self.relative_attention_max_distance
        if not self.is_decoder:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        log_ratio = torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact)
        relative_position_if_large = max_exact + (log_ratio * (num_buckets - max_exact)).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(relative_position)
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
    ):
        is_cross_attention = encoder_hidden_states is not None
        batch_size, seq_length = hidden_states.shape[:2]

        # use encoder_hidden_states if cross attention
        current_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
        # checking that the `sequence_length` of the `past_key_value` is the same as the provided
        # `encoder_hidden_states` to support prefix tuning
        if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]:
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        else:
            key_states = self._shape(self.k(current_states))
            value_states = self._shape(self.v(current_states))
            if past_key_value is not None and not is_cross_attention:
                # reuse k, v, self_attention
                key_states = torch.cat([past_key_value[0], key_states], dim=2)
                value_states = torch.cat([past_key_value[1], value_states], dim=2)

        query_states = self._shape(self.q(hidden_states))
        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))

        # compute positional bias
        if self.has_relative_attention_bias:
            query_length = seq_length
            if past_key_value is not None:
                query_length += past_key_value[0].shape[2]
            position_bias = self.compute_bias(query_length, key_states.size(2), device=attention_scores.device)
        else:
            position_bias = torch.zeros(
                (1, self.n_heads, seq_length, key_states.size(2)),
                device=attention_scores.device,
                dtype=attention_scores.dtype,
                requires_grad=self.training,
            )
        if past_key_value is not None:
            position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
        if attention_mask is not None:
            position_bias = position_bias + attention_mask  # (batch_size, n_heads, seq_length, key_length)

        if self.is_decoder:
            # save the key/value states so that further calls can reuse (self-attention) or fully
            # re-use (cross-attention) the cached projections
            past_key_value = (key_states, value_states)

        attention_scores += position_bias
        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).type_as(attention_scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        context_states = torch.matmul(attn_weights, value_states)
        # (batch_size, n_heads, seq_length, key_value_proj_dim) -> (batch_size, seq_length, inner_dim)
        context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
        attn_output = self.o(context_states)
        return attn_output, attn_weights, past_key_value


class UMT5LayerSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.SelfAttention = UMT5Attention(config, has_relative_attention_bias=True)
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class UMT5LayerCrossAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.EncDecAttention = UMT5Attention(config, has_relative_attention_bias=False)
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class UMT5Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(UMT5LayerSelfAttention(config))
        if self.is_decoder:
            self.layer.append(UMT5LayerCrossAttention(config))

        self.layer.append(UMT5LayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):
        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None

        hidden_states, self_attn_weights, present_key_value = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=self_attn_past_key_value,
        )

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            # cross_attn cached key/values tuple is at positions 3,4 of the past_key_value tuple
            cross_attn_past_key_value = past_key_value[2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.layer[1](
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
            )
            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16:
                max_dtype = torch.finfo(hidden_states.dtype).max
                clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            present_key_value += cross_attn_present_key_value

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (
            hidden_states,
            present_key_value,
        )

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


class UMT5ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config: UMT5Config):
        super().__init__()
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.out_proj = nn.Linear(config.d_model, config.num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class UMT5PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = UMT5Config
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["UMT5Block"]
    _keep_in_fp32_modules = ["wo"]

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, UMT5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(
            module, (UMT5Model, UMT5ForConditionalGeneration, UMT5EncoderModel, UMT5ForQuestionAnswering)
        ):
            # Mesh TensorFlow embeddings initialization
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "qa_outputs"):
                module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
                module.qa_outputs.bias.data.zero_()
        elif isinstance(module, UMT5ClassificationHead):
            module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.dense, "bias") and module.dense.bias is not None:
                module.dense.bias.data.zero_()
            module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None:
                module.out_proj.bias.data.zero_()
        elif isinstance(module, UMT5DenseActDense):
            # Mesh TensorFlow FF initialization
            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UMT5DenseGatedActDense):
            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UMT5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, (UMT5Attention, UMT5Stack)):
            module.gradient_checkpointing = value

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the"
                " pad_token_id. See UMT5 docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids


class UMT5Stack(UMT5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)
        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder
        self.block = nn.ModuleList([UMT5Block(config) for i in range(config.num_layers)])
        self.final_layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError("You have to initialize the model with valid token embeddings")
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        # required mask seq length can be calculated via length of past
        mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length

        if use_cache is True:
            if not self.is_decoder:
                raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

        if attention_mask is None:
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
        if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(
                batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
            )

        # initialize past_key_values with `None` if past does not exist
        if past_key_values is None:
            past_key_values = [None] * len(self.block)

        # Make the self-attention mask broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        present_key_value_states = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.is_decoder else None

        hidden_states = self.dropout(inputs_embeds)

        for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return tuple(module(*inputs, use_cache, output_attentions))

                    return custom_forward

                layer_outputs = checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    extended_attention_mask,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with gradient checkpointing
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_value,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                present_key_value_states = present_key_value_states + (layer_outputs[1],)

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[3],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    present_key_value_states,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )


UMT5_START_DOCSTRING = r"""

    The UMT5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
    Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
    Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
    text-to-text denoising generative setting.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`UMT5Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

UMT5_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
                `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.

        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

UMT5_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare UMT5 Model transformer outputting raw hidden-states without any specific head on top.",
    UMT5_START_DOCSTRING,
)
class UMT5Model(UMT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(noisy_text, return_tensors="pt")
    >>> labels = tokenizer(text_target=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```"""

    model_type = "umt5"
    config_class = UMT5Config
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
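        >>> # last_hidden_states has shape (batch_size, target_sequence_length, config.d_model)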
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@add_start_docstrings("UMT5 Model with a `language modeling` head on top.", UMT5_START_DOCSTRING)
class UMT5ForConditionalGeneration(UMT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
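        >>> # generate() returns decoder token ids; the leading id is the decoder start token (the pad token for UMT5)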
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab (Mesh TensorFlow convention for tied embeddings)
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to the logits device to enable model parallelism
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # cut decoder_input_ids if past is used
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@add_start_docstrings(
    "The bare UMT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
    UMT5_START_DOCSTRING,
)
class UMT5EncoderModel(UMT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
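    >>> # hidden_state has shape (batch_size, sequence_length, config.d_model)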
    ```"""

    model_type = "umt5"
    config_class = UMT5Config
    _tied_weights_keys = ["encoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(UMT5_ENCODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


@add_start_docstrings(
    """
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """,
    UMT5_START_DOCSTRING,
)
class UMT5ForSequenceClassification(UMT5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: UMT5Config):
        super().__init__(config)
        self.transformer = UMT5Model(config)
        self.classification_head = UMT5ClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

        self.model_parallel = False

    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Returns:
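
        Example (a minimal sketch; the plain `google/umt5-small` checkpoint carries no classification head, so
        `num_labels` and the head weights below are freshly initialized rather than pretrained):

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForSequenceClassification.from_pretrained("google/umt5-small", num_labels=2)
        >>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
        >>> logits = model(**inputs).logits  # decoder_input_ids are derived from input_ids automatically
        ```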
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        # Unlike other models, UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        batch_size, _, hidden_size = sequence_output.shape
        sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


@add_start_docstrings(
    """
    UMT5 Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers
    on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    UMT5_START_DOCSTRING,
)
class UMT5ForQuestionAnswering(UMT5PreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.num_labels = config.num_labels
        self.qa_outputs = nn.Linear(config.d_model, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
            are not taken into account for computing the loss.
        Returns:
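
        Example (a minimal sketch; the plain `google/umt5-small` checkpoint has no trained QA head, so the span
        logits below come from a freshly initialized `qa_outputs` layer):

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForQuestionAnswering.from_pretrained("google/umt5-small")
        >>> inputs = tokenizer("Who owns the dog? Paul owns the dog.", return_tensors="pt")
        >>> outputs = model(**inputs)  # decoder_input_ids are derived from input_ids automatically
        >>> start_logits, end_logits = outputs.start_logits, outputs.end_logits
        ```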
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if start_positions is not None and end_positions is not None:
            use_cache = False

        # Unlike other models, UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=None,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )