U
    ,-e                  
   @   st  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( e&)e*Z+dZ,dZ-dZ.dgZ/dd Z0da1e% rde$ rde+2d z
e0  W n4 e3k
r` Z4 ze+5de4  W 5 dZ4[4X Y nX n dd Z6d\ddZ7d]ddZ8d^ddZ9d d! Z:G d"d# d#ej;j<Z=G d$d% d%ej;j<Z>G d&d' d'Z?d_d(d)Z@d*d+ ZAd`d,d-ZBG d.d/ d/e
jCZDG d0d1 d1e
jCZEG d2d3 d3e
jCZFG d4d5 d5e
jCZGG d6d7 d7e
jCZHG d8d9 d9e
jCZIG d:d; d;e
jCZJG d<d= d=e
jCZKG d>d? d?e
jCZLG d@dA dAe
jCZMG dBdC dCe
jCZNG dDdE dEeZOdFZPdGZQe"dHePG dIdJ dJeOZRe"dKePG dLdM dMeOZSG dNdO dOe
jCZTe"dPePG dQdR dReOZUe"dSePG dTdU dUeOZVe"dVePG dWdX dXeOZWe"dYePG dZd[ d[eOZXdS )az PyTorch MRA model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigzuw-madison/mra-base-512-4r   ZAutoTokenizerc                     sN   t t jjjd d   fdd} | dddg}td|d	d
add lad S )NZkernelsmrac                    s    fdd| D S )Nc                    s   g | ]} | qS  r    ).0fileZ
src_folderr    e/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/mra/modeling_mra.py
<listcomp>C   s     z:load_cuda_kernels.<locals>.append_root.<locals>.<listcomp>r    )filesr#   r    r$   append_rootB   s    z&load_cuda_kernels.<locals>.append_rootzcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verboser   )r   __file__resolveparentr
   r(   )r'   Z	src_filesr    r#   r$   load_cuda_kernels>   s
    r-   zLoading custom CUDA kernels...zFailed to load CUDA kernels. Mra requires custom CUDA kernels. Please verify that compatible versions of PyTorch and CUDA Toolkit are installed: c                 C   s   t |  dkrtdt | dkr0td| ddkrFtd| ddkr\td| jd	d
jdd	}| }| }| }t	||||\}}|dd	dddddddf }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr(   Z	index_max)sparse_qk_prodindicesquery_num_blockkey_num_blockZ
index_valsmax_valsmax_vals_scatterr    r    r$   
sparse_max]   s    $rD   r1   c                 C   s   t |  dkrtdt | dkr0td| jd |jd krLtd| j\}}|| }tj|dtj|jd}| |||} | |dddf ||  ddf } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r/   z$mask must be a 2-dimensional tensor.r0   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r6   r7   r8   shapetorcharangelongrG   reshape)maskr?   
block_size
batch_sizeseq_len	num_block	batch_idxr    r    r$   sparse_masky   s    
&rS   c           	      C   s"  |   \}}}|  \}}}|| dkr0td|| dkrDtd| ||| ||dd} |||| ||dd}t|   dkrtdt|  dkrtdt|  d	krtd
|  ddkrtd| ddkrtd|  } | }| }| }t| || S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r5   r2   r.   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r/   r0   r   r1   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r7   r8   rL   r;   r6   r<   r=   r(   mm_to_sparse)	dense_query	dense_keyr?   rN   rO   Z
query_sizer4   _key_sizer    r    r$   rV      s.    rV   c           	      C   s  |  \}}}|| dkr"td|  d|kr8td|  d|krNtd|||| ||dd}t|   d	krtd
t|  d	krtdt|  dkrtd| ddkrtd|  } | }| }| }t| |||}|dd||| |}|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   rT   r/   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r5   r2   r.   ,sparse_query must be a 4-dimensional tensor.rU   r0   r1   z8The size of the third dimension of dense_key must be 32.)	r7   r8   rL   r;   r6   r<   r=   r(   sparse_dense_mm)	sparse_queryr?   rX   r@   rN   rO   rZ   r4   Zdense_qk_prodr    r    r$   r\      s.    r\   c                 C   s    | | | t j| |dd  S )NfloorZrounding_mode)rI   divrK   )r?   Zdim_1_blockZdim_2_blockr    r    r$   transpose_indices   s    ra   c                   @   s2   e Zd Zedd Zedd Zed	ddZdS )
MraSampledDenseMatMulc                 C   s&   t ||||}| ||| || _|S N)rV   save_for_backwardrN   )ctxrW   rX   r?   rN   r>   r    r    r$   forward   s    zMraSampledDenseMatMul.forwardc                 C   sj   | j \}}}| j}|d| }|d| }t|||}t|dd|||}	t||||}
|
|	d d fS Nr   r5   r2   )saved_tensorsrN   r7   ra   r\   r;   )re   gradrW   rX   r?   rN   r@   rA   	indices_Tgrad_key
grad_queryr    r    r$   backward   s    zMraSampledDenseMatMul.backwardr1   c                 C   s   t | |||S rc   )rb   apply)rW   rX   r?   rN   r    r    r$   operator_call   s    z#MraSampledDenseMatMul.operator_callN)r1   __name__
__module____qualname__staticmethodrf   rm   ro   r    r    r    r$   rb      s   


rb   c                   @   s0   e Zd Zedd Zedd Zedd ZdS )MraSparseDenseMatMulc                 C   s&   t ||||}| ||| || _|S rc   )r\   rd   r@   )re   r]   r?   rX   r@   r>   r    r    r$   rf      s    zMraSparseDenseMatMul.forwardc           
      C   s`   | j \}}}| j}|d|d }t|||}t|dd|||}t|||}	|	d |d fS rg   )rh   r@   r7   ra   r\   r;   rV   )
re   ri   r]   r?   rX   r@   rA   rj   rk   rl   r    r    r$   rm     s    zMraSparseDenseMatMul.backwardc                 C   s   t | |||S rc   )ru   rn   )r]   r?   rX   r@   r    r    r$   ro     s    z"MraSparseDenseMatMul.operator_callNrp   r    r    r    r$   ru      s   

	ru   c                   @   s   e Zd Zedd ZdS )MraReduceSumc                 C   s  |   \}}}}t|   dkr(tdt|  dkr@td|   \}}}}|  \}}| jdd|| |} tj| dtj|jd}tj	||dd	 |d d d f |  || }	tj
|| |f| j| jd}
|
d|	| |||}|||| }|S )
Nr.   r[   r/   r0   r3   r   rE   r^   r_   )r7   r6   r8   sumrL   rI   rJ   rK   rG   r`   zerosrF   Z	index_add)r]   r?   r@   rA   rO   rQ   rN   rY   rR   Zglobal_idxestempoutputr    r    r$   ro     s*    &
  zMraReduceSum.operator_callN)rq   rr   rs   rt   ro   r    r    r    r$   rv     s   rv   c                 C   s  |   \}}}|| }d}	|dk	r||||jdd}
| ||||jdd|
dddddf d  }|||||jdd|
dddddf d  }|dk	r|||||jdd|
dddddf d  }	nl|tj||tj| jd }
| ||||jdd}|||||jdd}|dk	rD|||||jdd}	t||	ddt
| }|jdddj}|dk	r|d	|
dddddf |
dddddf  d
k    }||
||	fS )z/
    Compute low resolution approximation.
    Nr5   r3   r2   ư>rE   T)r4   Zkeepdims     @g      ?)r7   rL   rw   rI   onesfloatrG   meanmatmulr;   mathsqrtr9   r:   )querykeyrN   rM   valuerO   rP   head_dimnum_block_per_row	value_hattoken_countZ	query_hatZkey_hatlow_resolution_logitlow_resolution_logit_row_maxr    r    r$   get_low_resolution_logit0  s4    

:r   c                 C   sT  | j \}}}|dkrf|d }tj||| jd}	tjtj|	| d|d}
| |
dddddf d  } |dkr| ddd|ddf d | ddd|ddf< | ddddd|f d | ddddd|f< tj| |d|ddd	d
}|j}|dkr.|j	j
ddj	}| |ddddf k }n|dkr>d}nt| d||fS )zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r/   rG   )ZdiagonalNg     @r5   TF)r4   largestsortedfullr3   sparsez# is not a valid approx_model value.)rH   rI   r}   rG   ZtrilZtriuZtopkrL   r?   r:   minr~   r8   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrO   Ztotal_blocks_per_rowrY   offsetZ	temp_maskZdiagonal_maskZ
top_k_valsr?   	thresholdhigh_resolution_maskr    r    r$   get_block_idxesX  s4    
    

r   c	           $   	   C   s  t dkrt|  S |  \}	}
}}|	|
 }|| dkrBtd|| }| |||} ||||}||||}|dk	r| |dddddf  } ||dddddf  }||dddddf  }|dkrt| ||||\}}}}n>|dkr"t  t| |||\}}}}W 5 Q R X nt	dt " || }t
|||||\}}W 5 Q R X tj| |||dt| }t||||\}}|| }|dk	r|dd	t||dddddddf    }t|}t||||}t||||}|dkrjt|| d|  |dddddf  }t||dddddddf d	d	|d	|||}|jd
ddddddf d	d	|||}|d	d	|||| } |dk	r| | } t| | dk  }!||!dddddf  }||! }t|  | dk  }"||"dddddf  }||" }|| |dddddf |dddddf  d  }#n2|dkr||dddddf d  }#nt	d|dk	r|#|dddddf  }#|#|	|
||}#|#S )z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rN   r|   r   r5   r3   r{   z-config.approx_mode must be "full" or "sparse")r(   rI   Z
zeros_likeZrequires_grad_r7   r8   rL   r   Zno_grad	Exceptionr   rb   ro   r   r   rD   rS   expru   rv   r   repeatrw   r~   )$r   r   r   rM   r   r   rN   r   r   rO   Znum_headrP   r   Z
meta_batchr   r   r   r   r   rY   Zlow_resolution_logit_normalizedr?   r   Zhigh_resolution_logitrB   rC   Zhigh_resolution_attnZhigh_resolution_attn_outZhigh_resolution_normalizerZlow_resolution_attnZlow_resolution_attn_outZlow_resolution_normalizerZlog_correctionZlow_resolution_corrZhigh_resolution_corrcontext_layerr    r    r$   mra2_attention~  s        

   
   
.
      
$     .
.
 
r   c                       s*   e Zd ZdZ fddZdddZ  ZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|jd |j| _	t|j
|j| _tj|j|jd| _t|j| _| dt|jdd  t|dd| _| jdtj| j tj| jjd	d
d d S )N)padding_idxr/   Zepsposition_ids)r   r5   position_embedding_typeabsolutetoken_type_idsrE   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizeZpad_token_idword_embeddingsmax_position_embeddingsposition_embeddingsZtype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutZregister_bufferrI   rJ   expandgetattrr   rx   r   r7   rK   rG   selfconfig	__class__r    r$   r     s    
zMraEmbeddings.__init__Nc                 C   s   |d k	r|  }n|  d d }|d }|d krH| jd d d |f }|d krt| dr| jd d d |f }||d |}|}ntj|tj| jjd}|d kr| 	|}| 
|}	||	 }
| jdkr| |}|
|7 }
| |
}
| |
}
|
S )Nr5   r   r   r   rE   r   )r7   r   hasattrr   r   rI   rx   rK   rG   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   r    r    r$   rf     s,    







zMraEmbeddings.forward)NNNNrq   rr   rs   __doc__r   rf   __classcell__r    r    r   r$   r     s   r   c                       s0   e Zd Zd fdd	Zdd Zd	ddZ  ZS )
MraSelfAttentionNc                    s  t    |j|j dkr>t|ds>td|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|d k	r|n|j| _|jd |j | _t| jt|jd d | _|j| _|j| _|j| _d S )Nr   Zembedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r1   r/   )r   r   r   num_attention_headsr   r8   r=   attention_head_sizeall_head_sizer   Linearr   r   r   r   Zattention_probs_dropout_probr   r   r   Zblock_per_rowrQ   r   r   r   r   r   r   r   r   r    r$   r   ,  s&    
zMraSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr5   r   r/   r   r   )r7   r   r   viewpermute)r   layerZnew_layer_shaper    r    r$   transpose_for_scoresH  s    
z%MraSelfAttention.transpose_for_scoresc              
   C   s  |  |}| | |}| | |}| |}| \}}}	}
d|d  }| d|d|| |	 }d}|
|k r|||	||
 f}t	j
|t	j||jdgdd}t	j
|t	j||jdgdd}t	j
|t	j||jdgdd}t| | | | | j| j| j| jd}|
|k rB|d d d d d d d |
f }||||	|
}|d	d
dd }| d d | jf }|j| }|f}|S )N      ?r|   r   r1   r   r5   r3   )r   r   r   r   r/   r   r2   )r   r   r   r   r7   squeezer   rL   r=   rI   catrx   rG   r   r~   rQ   r   r   r   r   r<   r   r   )r   hidden_statesattention_maskZmixed_query_layerZ	key_layerZvalue_layerZquery_layerrO   Z	num_headsrP   r   Zgpu_warp_sizeZpad_sizer   Znew_context_layer_shapeoutputsr    r    r$   rf   M  s@    

 
 
zMraSelfAttention.forward)N)N)rq   rr   rs   r   r   rf   r   r    r    r   r$   r   +  s   r   c                       s4   e Zd Z fddZejejejdddZ  ZS )MraSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r   r   r   r   r   denser   r   r   r   r   r   r   r    r$   r     s    
zMraSelfOutput.__init__r   input_tensorreturnc                 C   s&   |  |}| |}| || }|S rc   r   r   r   r   r   r   r    r    r$   rf     s    

zMraSelfOutput.forwardrq   rr   rs   r   rI   Tensorrf   r   r    r    r   r$   r     s   r   c                       s0   e Zd Zd fdd	Zdd Zd	ddZ  ZS )
MraAttentionNc                    s.   t    t||d| _t|| _t | _d S )N)r   )r   r   r   r   r   rz   setpruned_headsr   r   r    r$   r     s    

zMraAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r3   )r6   r   r   r   r   r   r   r   r   r   rz   r   r   union)r   headsindexr    r    r$   prune_heads  s       zMraAttention.prune_headsc                 C   s2   |  ||}| |d |}|f|dd   }|S Nr   r   )r   rz   )r   r   r   Zself_outputsattention_outputr   r    r    r$   rf     s    zMraAttention.forward)N)N)rq   rr   rs   r   r   rf   r   r    r    r   r$   r     s   r   c                       s0   e Zd Z fddZejejdddZ  ZS )MraIntermediatec                    sB   t    t|j|j| _t|jt	r6t
|j | _n|j| _d S rc   )r   r   r   r   r   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   r   r    r$   r     s
    
zMraIntermediate.__init__r   r   c                 C   s   |  |}| |}|S rc   )r   r   r   r   r    r    r$   rf     s    

zMraIntermediate.forwardr   r    r    r   r$   r     s   r   c                       s4   e Zd Z fddZejejejdddZ  ZS )	MraOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r$   r     s    
zMraOutput.__init__r   c                 C   s&   |  |}| |}| || }|S rc   r   r   r    r    r$   rf     s    

zMraOutput.forwardr   r    r    r   r$   r     s   r   c                       s.   e Zd Z fddZdddZdd Z  ZS )	MraLayerc                    sB   t    |j| _d| _t|| _|j| _t|| _t	|| _
d S Nr   )r   r   chunk_size_feed_forwardseq_len_dimr   	attentionZadd_cross_attentionr   intermediater   rz   r   r   r    r$   r     s    


zMraLayer.__init__Nc                 C   sB   |  ||}|d }|dd  }t| j| j| j|}|f| }|S r   )r   r   feed_forward_chunkr   r   )r   r   r   Zself_attention_outputsr   r   layer_outputr    r    r$   rf     s       
zMraLayer.forwardc                 C   s   |  |}| ||}|S rc   )r   rz   )r   r   Zintermediate_outputr   r    r    r$   r     s    
zMraLayer.feed_forward_chunk)N)rq   rr   rs   r   rf   r   r   r    r    r   r$   r     s   	
r   c                       s&   e Zd Z fddZdddZ  ZS )	
MraEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r    )r   )r!   rY   r   r    r$   r%     s     z'MraEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r   Z
ModuleListrangenum_hidden_layersr   gradient_checkpointingr   r   r   r$   r     s    
 zMraEncoder.__init__NFTc                 C   s   |rdnd }t | jD ]T\}}|r,||f }| jrX| jrXdd }	tjj|	|||}
n
|||}
|
d }q|rz||f }|stdd ||fD S t||dS )Nr    c                    s    fdd}|S )Nc                     s    |  S rc   r    )inputsmoduler    r$   custom_forward  s    zIMraEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr    )r  r  r    r  r$   create_custom_forward  s    z1MraEncoder.forward.<locals>.create_custom_forwardr   c                 s   s   | ]}|d k	r|V  qd S rc   r    )r!   vr    r    r$   	<genexpr>  s      z%MraEncoder.forward.<locals>.<genexpr>)last_hidden_stater   )		enumerater   r  ZtrainingrI   utils
checkpointtupler   )r   r   r   	head_maskoutput_hidden_statesreturn_dictZall_hidden_statesiZlayer_moduler  Zlayer_outputsr    r    r$   rf     s*    



zMraEncoder.forward)NNFTrq   rr   rs   r   rf   r   r    r    r   r$   r     s   	    r   c                       s0   e Zd Z fddZejejdddZ  ZS )MraPredictionHeadTransformc                    sV   t    t|j|j| _t|jtr6t	|j | _
n|j| _
tj|j|jd| _d S r   )r   r   r   r   r   r   r   r   r   r   transform_act_fnr   r   r   r   r    r$   r     s    
z#MraPredictionHeadTransform.__init__r   c                 C   s"   |  |}| |}| |}|S rc   )r   r  r   r   r    r    r$   rf   (  s    


z"MraPredictionHeadTransform.forwardr   r    r    r   r$   r    s   	r  c                       s$   e Zd Z fddZdd Z  ZS )MraLMPredictionHeadc                    sL   t    t|| _tj|j|jdd| _t	t
|j| _| j| j_d S )NF)bias)r   r   r  	transformr   r   r   r   decoder	ParameterrI   rx   r  r   r   r    r$   r   1  s
    

zMraLMPredictionHead.__init__c                 C   s   |  |}| |}|S rc   )r  r  r   r    r    r$   rf   >  s    

zMraLMPredictionHead.forwardr  r    r    r   r$   r  0  s   r  c                       s0   e Zd Z fddZejejdddZ  ZS )MraOnlyMLMHeadc                    s   t    t|| _d S rc   )r   r   r  predictionsr   r   r    r$   r   F  s    
zMraOnlyMLMHead.__init__)sequence_outputr   c                 C   s   |  |}|S rc   )r  )r   r  prediction_scoresr    r    r$   rf   J  s    
zMraOnlyMLMHead.forwardr   r    r    r   r$   r  E  s   r  c                   @   s.   e Zd ZdZeZdZdZdd Zd
ddZ	d	S )MraPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r   Tc                 C   s   t |tjr:|jjjd| jjd |jdk	r|jj	  nft |tj
rz|jjjd| jjd |jdk	r|jj|j 	  n&t |tjr|jj	  |jjd dS )zInitialize the weightsg        )r   ZstdNr   )r   r   r   weightdataZnormal_r   Zinitializer_ranger  Zzero_r   r   r   Zfill_)r   r  r    r    r$   _init_weightsZ  s    

z MraPreTrainedModel._init_weightsFc                 C   s   t |tr||_d S rc   )r   r   r  )r   r  r   r    r    r$   _set_gradient_checkpointingj  s    
z.MraPreTrainedModel._set_gradient_checkpointingN)F)
rq   rr   rs   r   r   config_classZbase_model_prefixZsupports_gradient_checkpointingr!  r"  r    r    r    r$   r  P  s   r  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
ak	  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z]The bare MRA Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Z fddZdd Zdd Zdd Zee	d	e
eeed
deej eej eej eej eej eej ee ee eeef d	ddZ  ZS )MraModelc                    s2   t  | || _t|| _t|| _|   d S rc   )r   r   r   r   r   r   encoder	post_initr   r   r    r$   r     s
    

zMraModel.__init__c                 C   s   | j jS rc   r   r   r   r    r    r$   get_input_embeddings  s    zMraModel.get_input_embeddingsc                 C   s   || j _d S rc   r'  )r   r   r    r    r$   set_input_embeddings  s    zMraModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr%  r   r   r   )r   Zheads_to_pruner   r   r    r    r$   _prune_heads  s    zMraModel._prune_headsbatch_size, sequence_lengthr  output_typer#  N)	r   r   r   r   r  r   r  r  r   c	                 C   s  |d k	r|n| j j}|d k	r |n| j j}|d k	rB|d k	rBtdn@|d k	r`| || | }	n"|d k	rz| d d }	ntd|	\}
}|d k	r|jn|j}|d krtj|
|f|d}|d kr
t	| j
dr| j
jd d d |f }||
|}|}ntj|	tj|d}| ||	}| || j j}| j
||||d}| j|||||d}|d	 }|sl|f|d
d   S t||j|j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer5   z5You have to specify either input_ids or inputs_embedsr   r   rE   )r   r   r   r   )r   r  r  r  r   r   )r	  r   
attentionscross_attentions)r   r  use_return_dictr8   Z%warn_if_padding_and_no_attention_maskr7   rG   rI   r}   r   r   r   r   rx   rK   Zget_extended_attention_maskZget_head_maskr   r%  r   r   r0  r1  )r   r   r   r   r   r  r   r  r  r   rO   r   rG   r   r   Zextended_attention_maskZembedding_outputZencoder_outputsr  r    r    r$   rf     sZ    


zMraModel.forward)NNNNNNNN)rq   rr   rs   r   r)  r*  r,  r   MRA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rI   r   boolr   r   rf   r   r    r    r   r$   r$    s:   
        
r$  z1MRA Model with a `language modeling` head on top.c                       s   e Zd ZddgZ fddZdd Zdd Zee	d	e
eeed
deej eej eej eej eej eej eej ee ee eeef d
ddZ  ZS )MraForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    s,   t  | t|| _t|| _|   d S rc   )r   r   r$  r   r  clsr&  r   r   r    r$   r     s    

zMraForMaskedLM.__init__c                 C   s
   | j jjS rc   r9  r  r  r(  r    r    r$   get_output_embeddings&  s    z$MraForMaskedLM.get_output_embeddingsc                 C   s   || j j_d S rc   r:  )r   Znew_embeddingsr    r    r$   set_output_embeddings)  s    z$MraForMaskedLM.set_output_embeddingsr-  r.  N
r   r   r   r   r  r   labelsr  r  r   c
              
   C   s   |	dk	r|	n| j j}	| j||||||||	d}
|
d }| |}d}|dk	rnt }||d| j j|d}|	s|f|
dd  }|dk	r|f| S |S t|||
j|
j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr   r   r   r  r   r  r  r   r5   r   losslogitsr   r0  )
r   r2  r   r9  r   r   r   r   r   r0  )r   r   r   r   r   r  r   r>  r  r  r   r  r  Zmasked_lm_lossloss_fctrz   r    r    r$   rf   ,  s4    
zMraForMaskedLM.forward)	NNNNNNNNN)rq   rr   rs   Z_tied_weights_keysr   r;  r<  r   r3  r4  r   r5  r   r6  r   rI   r   r7  r   r   rf   r   r    r    r   r$   r8    s>   	         
r8  c                       s(   e Zd ZdZ fddZdd Z  ZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                    sF   t    t|j|j| _t|j| _t|j|j	| _
|| _d S rc   )r   r   r   r   r   r   r   r   r   
num_labelsout_projr   r   r   r    r$   r   i  s
    
zMraClassificationHead.__init__c                 K   sR   |d d dd d f }|  |}| |}t| jj |}|  |}| |}|S )Nr   )r   r   r   r   r   rF  )r   featureskwargsxr    r    r$   rf   q  s    



zMraClassificationHead.forwardr   r    r    r   r$   rD  f  s   rD  zMRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.c                       s   e Zd Z fddZeedeee	e
dd	eej eej eej eej eej eej eej ee ee eee	f d
ddZ  ZS )
MraForSequenceClassificationc                    s4   t  | |j| _t|| _t|| _|   d S rc   )r   r   rE  r$  r   rD  
classifierr&  r   r   r    r$   r     s
    

z%MraForSequenceClassification.__init__r-  r.  Nr=  c
              
   C   sp  |	dk	r|	n| j j}	| j||||||||	d}
|
d }| |}d}|dk	r,| j jdkr| jdkrnd| j _n4| jdkr|jtjks|jtj	krd| j _nd| j _| j jdkrt
 }| jdkr|| | }n
|||}nN| j jdkrt }||d| j|d}n| j jdkr,t }|||}|	s\|f|
dd  }|dk	rX|f| S |S t|||
j|
jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr?  r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr5   r@  )r   r2  r   rK  Zproblem_typerE  rF   rI   rK   r=   r	   r   r   r   r   r   r   r0  )r   r   r   r   r   r  r   r>  r  r  r   r  rB  rA  rC  rz   r    r    r$   rf     sR    



"


z$MraForSequenceClassification.forward)	NNNNNNNNN)rq   rr   rs   r   r   r3  r4  r   r5  r   r6  r   rI   r   r7  r   r   rf   r   r    r    r   r$   rJ  {  s8   	         
rJ  zMRA Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.c                       s   e Zd Z fddZeedeee	e
dd	eej eej eej eej eej eej eej ee ee eee	f d
ddZ  ZS )
MraForMultipleChoicec                    sD   t  | t|| _t|j|j| _t|jd| _| 	  d S r   )
r   r   r$  r   r   r   r   pre_classifierrK  r&  r   r   r    r$   r     s
    
zMraForMultipleChoice.__init__z(batch_size, num_choices, sequence_lengthr.  Nr=  c
              
   C   s  |	dk	r|	n| j j}	|dk	r&|jd n|jd }
|dk	rJ|d|dnd}|dk	rh|d|dnd}|dk	r|d|dnd}|dk	r|d|dnd}|dk	r|d|d|dnd}| j||||||||	d}|d }|dddf }| |}t |}| 	|}|d|
}d}|dk	rJt
 }|||}|	sz|f|dd  }|dk	rv|f| S |S t|||j|jdS )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r5   r2   r?  r   r@  )r   r2  rH   r   r7   r   rM  r   ZReLUrK  r   r   r   r0  )r   r   r   r   r   r  r   r>  r  r  Znum_choicesr   Zhidden_stateZpooled_outputrB  Zreshaped_logitsrA  rC  rz   r    r    r$   rf     sN    



zMraForMultipleChoice.forward)	NNNNNNNNN)rq   rr   rs   r   r   r3  r4  r   r5  r   r6  r   rI   r   r7  r   r   rf   r   r    r    r   r$   rL    s8   
         
rL  zMRA Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.c                       s   e Zd Z fddZeedeee	e
dd	eej eej eej eej eej eej eej ee ee eee	f d
ddZ  ZS )
MraForTokenClassificationc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rc   )r   r   rE  r$  r   r   r   r   r   r   r   rK  r&  r   r   r    r$   r   3  s    
z"MraForTokenClassification.__init__r-  r.  Nr=  c
              
   C   s  |	dk	r|	n| j j}	| j||||||||	d}
|
d }| |}| |}d}|dk	rt }|dk	r|ddk}|d| j}t	||dt
|j|}|||}n||d| j|d}|	s|f|
dd  }|dk	r|f| S |S t|||
j|
jdS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr?  r   r5   r   r@  )r   r2  r   r   rK  r   r   rE  rI   whereZtensorignore_indexZtype_asr   r   r0  )r   r   r   r   r   r  r   r>  r  r  r   r  rB  rA  rC  Zactive_lossZactive_logitsZactive_labelsrz   r    r    r$   rf   >  sH    

  z!MraForTokenClassification.forward)	NNNNNNNNN)rq   rr   rs   r   r   r3  r4  r   r5  r   r6  r   rI   r   r7  r   r   rf   r   r    r    r   r$   rN  -  s8            
rN  zMRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).c                       s   e Zd Z fddZeedeee	e
dd	eej eej eej eej eej eej eej eej ee ee eee	f dddZ  ZS )
MraForQuestionAnsweringc                    sB   t  | d|_|j| _t|| _t|j|j| _| 	  d S )Nr/   )
r   r   rE  r$  r   r   r   r   
qa_outputsr&  r   r   r    r$   r     s    
z MraForQuestionAnswering.__init__r-  r.  N)r   r   r   r   r  r   start_positionsend_positionsr  r  r   c              
   C   sB  |
dk	r|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d}|d}d}|dk	r|dk	rt| dkr|d}t| dkr|d}|d}|d|}|d|}t	|d}|||}|||}|| d }|
s,||f|dd  }|dk	r(|f| S |S t
||||j|jd	S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr?  r   r   r5   r3   )rP  r/   )rA  start_logits
end_logitsr   r0  )r   r2  r   rR  splitr   r6   r7   clampr   r   r   r0  )r   r   r   r   r   r  r   rS  rT  r  r  r   r  rB  rU  rV  Z
total_lossZignored_indexrC  Z
start_lossZend_lossrz   r    r    r$   rf     sN    








zMraForQuestionAnswering.forward)
NNNNNNNNNN)rq   rr   rs   r   r   r3  r4  r   r5  r   r6  r   rI   r   r7  r   r   rf   r   r    r    r   r$   rQ    s<             
rQ  )r1   )r1   )r1   )NN)r1   r   r   )Yr   r   pathlibr   typingr   r   r   rI   Ztorch.utils.checkpointr   Ztorch.nnr   r   r	   Ztorch.utils.cpp_extensionr
   Zactivationsr   Zmodeling_outputsr   r   r   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   r  r   r   r   r   r   r   Zconfiguration_mrar   Z
get_loggerrq   loggerr5  r6  Z_TOKENIZER_FOR_DOCZ!MRA_PRETRAINED_MODEL_ARCHIVE_LISTr-   r(   infor   ewarningrD   rS   rV   r\   ra   ZautogradFunctionrb   ru   rv   r   r   r   Moduler   r   r   r   r   r   r   r   r  r  r  r  ZMRA_START_DOCSTRINGr3  r$  r8  rD  rJ  rL  rN  rQ  r    r    r    r$   <module>   s     




(
(
(-   
s:V!3/lLTTN