""" PyTorch DeBERTa model."""

from collections.abc import Sequence
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_deberta import DebertaConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "DebertaConfig"
_CHECKPOINT_FOR_DOC = "microsoft/deberta-base"

# Masked LM docstring
_CHECKPOINT_FOR_MASKED_LM = "lsanochkin/deberta-large-feedback"
_MASKED_LM_EXPECTED_OUTPUT = "' Paris'"
_MASKED_LM_EXPECTED_LOSS = "0.54"

# QuestionAnswering docstring
_CHECKPOINT_FOR_QA = "Palak/microsoft_deberta-large_squad"
_QA_EXPECTED_OUTPUT = "' a nice puppet'"
_QA_EXPECTED_LOSS = 0.14
_QA_TARGET_START_INDEX = 12
_QA_TARGET_END_INDEX = 14

DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/deberta-base",
    "microsoft/deberta-large",
    "microsoft/deberta-xlarge",
    "microsoft/deberta-base-mnli",
    "microsoft/deberta-large-mnli",
    "microsoft/deberta-xlarge-mnli",
]


class ContextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
        self.dropout = StableDropout(config.pooler_dropout)
        self.config = config

    def forward(self, hidden_states):
        # "Pool" the model by taking the hidden state corresponding to the first token.
        context_token = hidden_states[:, 0]
        context_token = self.dropout(context_token)
        pooled_output = self.dense(context_token)
        pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
        return pooled_output

    @property
    def output_dim(self):
        return self.config.hidden_size
class XSoftmax(torch.autograd.Function):
    """
    Masked Softmax which is optimized for saving memory

    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
        mask (`torch.IntTensor`):
            The mask matrix where 0 indicates that the element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax

    Example:

    ```python
    >>> import torch
    >>> from transformers.models.deberta.modeling_deberta import XSoftmax

    >>> # Make a tensor
    >>> x = torch.randn([4, 20, 100])

    >>> # Create a mask
    >>> mask = (x > 0).int()

    >>> # Specify the dimension to apply softmax
    >>> dim = -1

    >>> y = XSoftmax.apply(x, mask, dim)
    ```"""

    @staticmethod
    def forward(self, input, mask, dim):
        self.dim = dim
        rmask = ~(mask.to(torch.bool))

        # Masked positions get the smallest representable value so they vanish after the softmax.
        output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
        output = torch.softmax(output, self.dim)
        output.masked_fill_(rmask, 0)
        self.save_for_backward(output)
        return output

    @staticmethod
    def backward(self, grad_output):
        (output,) = self.saved_tensors
        inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output)
        return inputGrad, None, None

    @staticmethod
    def symbolic(g, self, mask, dim):
        import torch.onnx.symbolic_helper as sym_help
        from torch.onnx.symbolic_opset9 import masked_fill, softmax

        mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
        r_mask = g.op(
            "Cast",
            g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
            to_i=sym_help.cast_pytorch_to_onnx["Bool"],
        )
        output = masked_fill(
            g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
        )
        output = softmax(g, output, dim)
        return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
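# Illustrative sanity check for the masked softmax above (a sketch, not part of the original module):
# masked positions receive probability 0 and each row re-normalizes over the unmasked entries only,
# so the result matches a plain softmax computed over the kept positions.
#
#     x = torch.tensor([[1.0, 2.0, 3.0]])
#     mask = torch.tensor([[1, 1, 0]])
#     XSoftmax.apply(x, mask, -1)   # ~ tensor([[0.2689, 0.7311, 0.0000]])
#     torch.softmax(x[:, :2], -1)   # ~ tensor([[0.2689, 0.7311]])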
class DropoutContext(object):
    def __init__(self):
        self.dropout = 0
        self.mask = None
        self.scale = 1
        self.reuse_mask = True


def get_mask(input, local_context):
    if not isinstance(local_context, DropoutContext):
        dropout = local_context
        mask = None
    else:
        dropout = local_context.dropout
        dropout *= local_context.scale
        mask = local_context.mask if local_context.reuse_mask else None

    if dropout > 0 and mask is None:
        mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool)

    if isinstance(local_context, DropoutContext):
        if local_context.mask is None:
            local_context.mask = mask

    return mask, dropout


class XDropout(torch.autograd.Function):
    """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""

    @staticmethod
    def forward(ctx, input, local_ctx):
        mask, dropout = get_mask(input, local_ctx)
        ctx.scale = 1.0 / (1 - dropout)
        if dropout > 0:
            ctx.save_for_backward(mask)
            return input.masked_fill(mask, 0) * ctx.scale
        else:
            return input

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.scale > 1:
            (mask,) = ctx.saved_tensors
            return grad_output.masked_fill(mask, 0) * ctx.scale, None
        else:
            return grad_output, None

    @staticmethod
    def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
        from torch.onnx import symbolic_opset12

        dropout_p = local_ctx
        if isinstance(local_ctx, DropoutContext):
            dropout_p = local_ctx.dropout
        # StableDropout only calls this function when training.
        train = True
        return symbolic_opset12.dropout(g, input, dropout_p, train)
class StableDropout(nn.Module):
    """
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probability
    """

    def __init__(self, drop_prob):
        super().__init__()
        self.drop_prob = drop_prob
        self.count = 0
        self.context_stack = None

    def forward(self, x):
        """
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        """
        if self.training and self.drop_prob > 0:
            return XDropout.apply(x, self.get_context())
        return x

    def clear_context(self):
        self.count = 0
        self.context_stack = None

    def init_context(self, reuse_mask=True, scale=1):
        if self.context_stack is None:
            self.context_stack = []
        self.count = 0
        for c in self.context_stack:
            c.reuse_mask = reuse_mask
            c.scale = scale

    def get_context(self):
        if self.context_stack is not None:
            if self.count >= len(self.context_stack):
                self.context_stack.append(DropoutContext())
            ctx = self.context_stack[self.count]
            ctx.dropout = self.drop_prob
            self.count += 1
            return ctx
        else:
            return self.drop_prob


class DebertaLayerNorm(nn.Module):
    """LayerNorm module in the TF style (epsilon inside the square root)."""

    def __init__(self, size, eps=1e-12):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(size))
        self.bias = nn.Parameter(torch.zeros(size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_type = hidden_states.dtype
        hidden_states = hidden_states.float()
        mean = hidden_states.mean(-1, keepdim=True)
        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
        hidden_states = (hidden_states - mean) / torch.sqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states.to(input_type)
        y = self.weight * hidden_states + self.bias
        return y


class DebertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class DebertaAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = DisentangledSelfAttention(config)
        self.output = DebertaSelfOutput(config)
        self.config = config

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
    ):
        self_output = self.self(
            hidden_states,
            attention_mask,
            output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
        )
        if output_attentions:
            self_output, att_matrix = self_output
        if query_states is None:
            query_states = hidden_states
        attention_output = self.output(self_output, query_states)

        if output_attentions:
            return (attention_output, att_matrix)
        else:
            return attention_output


class DebertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class DebertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class DebertaLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = DebertaAttention(config)
        self.intermediate = DebertaIntermediate(config)
        self.output = DebertaOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
        output_attentions=False,
    ):
        attention_output = self.attention(
            hidden_states,
            attention_mask,
            output_attentions=output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
        )
        if output_attentions:
            attention_output, att_matrix = attention_output
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        if output_attentions:
            return (layer_output, att_matrix)
        else:
            return layer_output


class DebertaEncoder(nn.Module):
    """Modified BertEncoder with relative position bias support"""

    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)])
        self.relative_attention = getattr(config, "relative_attention", False)
        if self.relative_attention:
            self.max_relative_positions = getattr(config, "max_relative_positions", -1)
            if self.max_relative_positions < 1:
                self.max_relative_positions = config.max_position_embeddings
            self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size)
        self.gradient_checkpointing = False

    def get_rel_embedding(self):
        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
        return rel_embeddings

    def get_attention_mask(self, attention_mask):
        if attention_mask.dim() <= 2:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
        elif attention_mask.dim() == 3:
            attention_mask = attention_mask.unsqueeze(1)

        return attention_mask

    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        if self.relative_attention and relative_pos is None:
            q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
            relative_pos = build_relative_position(q, hidden_states.size(-2), hidden_states.device)
        return relative_pos

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_hidden_states=True,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        return_dict=True,
    ):
        attention_mask = self.get_attention_mask(attention_mask)
        relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        if isinstance(hidden_states, Sequence):
            next_kv = hidden_states[0]
        else:
            next_kv = hidden_states
        rel_embeddings = self.get_rel_embedding()
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    next_kv,
                    attention_mask,
                    query_states,
                    relative_pos,
                    rel_embeddings,
                )
            else:
                hidden_states = layer_module(
                    next_kv,
                    attention_mask,
                    query_states=query_states,
                    relative_pos=relative_pos,
                    rel_embeddings=rel_embeddings,
                    output_attentions=output_attentions,
                )

            if output_attentions:
                hidden_states, att_m = hidden_states

            if query_states is not None:
                query_states = hidden_states
                if isinstance(hidden_states, Sequence):
                    next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
            else:
                next_kv = hidden_states

            if output_attentions:
                all_attentions = all_attentions + (att_m,)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
def build_relative_position(query_size, key_size, device):
    """
    Build relative position according to the query and key

    We assume the absolute position of the query \\(P_q\\) ranges from (0, query_size) and the absolute position of the
    key \\(P_k\\) ranges from (0, key_size). The relative positions from query to key are \\(R_{q \\rightarrow k} = P_q -
    P_k\\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    """

    q_ids = torch.arange(query_size, dtype=torch.long, device=device)
    k_ids = torch.arange(key_size, dtype=torch.long, device=device)
    rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1)
    rel_pos_ids = rel_pos_ids[:query_size, :]
    rel_pos_ids = rel_pos_ids.unsqueeze(0)
    return rel_pos_ids
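# Illustrative output (a sketch, not part of the original module): the relative position of a query
# token with respect to a key token is simply P_q - P_k, so for query_size=3 and key_size=4:
#
#     build_relative_position(3, 4, torch.device("cpu"))
#     # tensor([[[ 0, -1, -2, -3],
#     #          [ 1,  0, -1, -2],
#     #          [ 2,  1,  0, -1]]])   # shape [1, 3, 4]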
@torch.jit.script
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])


@torch.jit.script
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])


@torch.jit.script
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
    return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
class DisentangledSelfAttention(nn.Module):
    """
    Disentangled self-attention module

    Parameters:
        config (`DebertaConfig`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*; for more details, please refer to [`DebertaConfig`]

    """

    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.in_proj = nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False)
        self.q_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
        self.v_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
        self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []

        self.relative_attention = getattr(config, "relative_attention", False)
        self.talking_head = getattr(config, "talking_head", False)

        if self.talking_head:
            self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
            self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)

        if self.relative_attention:
            self.max_relative_positions = getattr(config, "max_relative_positions", -1)
            if self.max_relative_positions < 1:
                self.max_relative_positions = config.max_position_embeddings
            self.pos_dropout = StableDropout(config.hidden_dropout_prob)

            if "c2p" in self.pos_att_type:
                self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
            if "p2c" in self.pos_att_type:
                self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = StableDropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
    ):
        """
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module, usually the output of the previous layer; it will be the Q, K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, optional):
                Whether to return the attention matrix.

            query_states (`torch.FloatTensor`, optional):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\\(2 \\times
                \\text{max_relative_positions}\\), *hidden_size*].


        """
        if query_states is None:
            qp = self.in_proj(hidden_states)  # .split(self.all_head_size, dim=-1)
            query_layer, key_layer, value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1)
        else:

            def linear(w, b, x):
                if b is not None:
                    return torch.matmul(x, w.t()) + b.t()
                else:
                    return torch.matmul(x, w.t())  # + b.t()

            ws = self.in_proj.weight.chunk(self.num_attention_heads * 3, dim=0)
            qkvw = [torch.cat([ws[i * 3 + k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)]
            qkvb = [None] * 3

            q = linear(qkvw[0], qkvb[0], query_states.to(dtype=qkvw[0].dtype))
            k, v = [linear(qkvw[i], qkvb[i], hidden_states.to(dtype=qkvw[i].dtype)) for i in range(1, 3)]
            query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]]

        query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :])
        value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :])

        rel_att = None
        # Take the dot product between "query" and "key" to get the raw attention scores.
        # The scale factor accounts for the extra c2p/p2c score terms added below.
        scale_factor = 1 + len(self.pos_att_type)
        scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
        query_layer = query_layer / scale.to(dtype=query_layer.dtype)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        if self.relative_attention:
            rel_embeddings = self.pos_dropout(rel_embeddings)
            rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)

        if rel_att is not None:
            attention_scores = attention_scores + rel_att

        if self.talking_head:
            attention_scores = self.head_logits_proj(attention_scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
        attention_probs = self.dropout(attention_probs)
        if self.talking_head:
            attention_probs = self.head_weights_proj(attention_probs.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (-1,)
        context_layer = context_layer.view(new_context_layer_shape)
        if output_attentions:
            return (context_layer, attention_probs)
        else:
            return context_layer

    def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
        if relative_pos is None:
            q = query_layer.size(-2)
            relative_pos = build_relative_position(q, key_layer.size(-2), query_layer.device)
        if relative_pos.dim() == 2:
            relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
        elif relative_pos.dim() == 3:
            relative_pos = relative_pos.unsqueeze(1)
        elif relative_pos.dim() != 4:
            raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")

        att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
        relative_pos = relative_pos.long().to(query_layer.device)
        rel_embeddings = rel_embeddings[
            self.max_relative_positions - att_span : self.max_relative_positions + att_span, :
        ].unsqueeze(0)

        score = 0

        # content->position
        if "c2p" in self.pos_att_type:
            pos_key_layer = self.pos_proj(rel_embeddings)
            pos_key_layer = self.transpose_for_scores(pos_key_layer)
            c2p_att = torch.matmul(query_layer, pos_key_layer.transpose(-1, -2))
            c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
            c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos))
            score += c2p_att

        # position->content
        if "p2c" in self.pos_att_type:
            pos_query_layer = self.pos_q_proj(rel_embeddings)
            pos_query_layer = self.transpose_for_scores(pos_query_layer)
            pos_query_layer /= torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor)
            if query_layer.size(-2) != key_layer.size(-2):
                r_pos = build_relative_position(key_layer.size(-2), key_layer.size(-2), query_layer.device)
            else:
                r_pos = relative_pos
            p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
            p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2).to(dtype=key_layer.dtype))
            p2c_att = torch.gather(
                p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer)
            ).transpose(-1, -2)

            if query_layer.size(-2) != key_layer.size(-2):
                pos_index = relative_pos[:, :, :, 0].unsqueeze(-1)
                p2c_att = torch.gather(p2c_att, dim=-2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer))
            score += p2c_att

        return score


class DebertaEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        pad_token_id = getattr(config, "pad_token_id", 0)
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
        self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id)

        self.position_biased_input = getattr(config, "position_biased_input", True)
        if not self.position_biased_input:
            self.position_embeddings = None
        else:
            self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)

        if config.type_vocab_size > 0:
            self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)

        if self.embedding_size != config.hidden_size:
            self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if self.position_embeddings is not None:
            position_embeddings = self.position_embeddings(position_ids.long())
        else:
            position_embeddings = torch.zeros_like(inputs_embeds)

        embeddings = inputs_embeds
        if self.position_biased_input:
            embeddings += position_embeddings
        if self.config.type_vocab_size > 0:
            token_type_embeddings = self.token_type_embeddings(token_type_ids)
            embeddings += token_type_embeddings

        if self.embedding_size != self.config.hidden_size:
            embeddings = self.embed_proj(embeddings)

        embeddings = self.LayerNorm(embeddings)

        if mask is not None:
            if mask.dim() != embeddings.dim():
                if mask.dim() == 4:
                    mask = mask.squeeze(1).squeeze(1)
                mask = mask.unsqueeze(2)
            mask = mask.to(embeddings.dtype)

            embeddings = embeddings * mask

        embeddings = self.dropout(embeddings)
        return embeddings


class DebertaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = DebertaConfig
    base_model_prefix = "deberta"
    _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, DebertaEncoder):
            module.gradient_checkpointing = value
DEBERTA_START_DOCSTRING = r"""
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB of pretraining data.

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.


    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
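# A minimal configuration sketch (illustrative, not part of the original module): the disentangled
# attention described above is switched on through `relative_attention` and `pos_att_type` on
# `DebertaConfig`; the released `microsoft/deberta-*` checkpoints use both the content-to-position
# ("c2p") and position-to-content ("p2c") terms.
#
#     from transformers import DebertaConfig, DebertaModel
#
#     config = DebertaConfig(
#         relative_attention=True,
#         pos_att_type=["c2p", "p2c"],
#         max_relative_positions=-1,  # falls back to max_position_embeddings
#     )
#     model = DebertaModel(config)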
DEBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
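# A short usage sketch (illustrative; this is the generic `transformers` API rather than anything
# specific to this file): encode a sentence and read out the final hidden states of the bare encoder.
#
#     from transformers import AutoTokenizer, DebertaModel
#
#     tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
#     model = DebertaModel.from_pretrained("microsoft/deberta-base")
#     inputs = tokenizer("DeBERTa uses disentangled attention.", return_tensors="pt")
#     outputs = model(**inputs)
#     outputs.last_hidden_state.shape  # (1, sequence_length, hidden_size)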
@add_start_docstrings(
    "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    DEBERTA_START_DOCSTRING,
)
class DebertaModel(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embeddings = DebertaEmbeddings(config)
        self.encoder = DebertaEncoder(config)
        self.z_steps = 0
        self.config = config
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings):
        self.embeddings.word_embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError("The prune function is not implemented in DeBERTa model.")

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )
        encoded_layers = encoder_outputs[1]

        if self.z_steps > 1:
            hidden_states = encoded_layers[-2]
            layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
            query_states = encoded_layers[-1]
            rel_embeddings = self.encoder.get_rel_embedding()
            attention_mask = self.encoder.get_attention_mask(attention_mask)
            rel_pos = self.encoder.get_rel_pos(embedding_output)
            for layer in layers[1:]:
                query_states = layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=False,
                    query_states=query_states,
                    relative_pos=rel_pos,
                    rel_embeddings=rel_embeddings,
                )
                encoded_layers.append(query_states)

        sequence_output = encoded_layers[-1]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
            attentions=encoder_outputs.attentions,
        )
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class DebertaForMaskedLM(DebertaPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.deberta = DebertaModel(config)
        self.cls = DebertaOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_MASKED_LM,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="[MASK]",
        expected_output=_MASKED_LM_EXPECTED_OUTPUT,
        expected_loss=_MASKED_LM_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class DebertaPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
        self.dense = nn.Linear(config.hidden_size, self.embedding_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class DebertaLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = DebertaPredictionHeadTransform(config)

        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
        # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
        self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class DebertaOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = DebertaLMPredictionHead(config)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores
@add_start_docstrings(
    """
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaForSequenceClassification(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaModel(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim

        self.classifier = nn.Linear(output_dim, num_labels)
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.deberta.set_input_embeddings(new_embeddings)

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    # regression task
                    loss_fn = nn.MSELoss()
                    logits = logits.view(-1).to(labels.dtype)
                    loss = loss_fn(logits, labels.view(-1))
                elif labels.dim() == 1 or labels.size(-1) == 1:
                    label_index = (labels >= 0).nonzero()
                    labels = labels.long()
                    if label_index.size(0) > 0:
                        labeled_logits = torch.gather(
                            logits, 0, label_index.expand(label_index.size(0), logits.size(1))
                        )
                        labels = torch.gather(labels, 0, label_index.view(-1))
                        loss_fct = CrossEntropyLoss()
                        loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
                    else:
                        loss = torch.tensor(0).to(logits)
                else:
                    log_softmax = nn.LogSoftmax(-1)
                    loss = -((log_softmax(logits) * labels).sum(-1)).mean()
            elif self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
@add_start_docstrings(
    """
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaForTokenClassification(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.deberta = DebertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
@add_start_docstrings(
    """
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaForQuestionAnswering(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.deberta = DebertaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_QA,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_QA_EXPECTED_OUTPUT,
        expected_loss=_QA_EXPECTED_LOSS,
        qa_target_start_index=_QA_TARGET_START_INDEX,
        qa_target_end_index=_QA_TARGET_END_INDEX,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )