""" PyTorch ConvBERT model."""

import math
import os
from operator import attrgetter
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, get_activation
from ...modeling_outputs import (
    BaseModelOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_convbert import ConvBertConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "YituTech/conv-bert-base"
_CONFIG_FOR_DOC = "ConvBertConfig"

CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "YituTech/conv-bert-base",
    "YituTech/conv-bert-medium-small",
    "YituTech/conv-bert-small",
]
attrgetter)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModelSequenceSummary)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )ConvBertConfigzYituTech/conv-bert-baser   zYituTech/conv-bert-medium-smallzYituTech/conv-bert-smallc                 C   s  zddl }W n  tk
r,   td  Y nX tj|}td|  |j	|}i }|D ]4\}}td| d|  |j
||}	|	||< q^ddd	d
dddd}
|jdkrd}nd}t|jD ]D}d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d|
d| d< d| d |
d| d!< d| d"|
d| d#< d| d$|
d| d%< d| d&|
d| d'< d| d(|
d| d)< d| d*|
d| d+< d| d,|
d| d-< d| d.|
d| d/< d| d0|
d| d1< d| d2|
d| d3< d| d4|
d| d5< d| d6| d7|
d| d8< d| d6| d9|
d| d:< d| d;| d7|
d| d<< d| d;| d9|
d| d=< d| d>|
d| d?< d| d@|
d| dA< q|  D ]}|d }t|}|| }|
| }t|| }tdB| dC| dD |d7r|dEs|dFs|j}|dGr|ddHd}|dIr|dHdd}|dJr|dK}||_q| S )Lz'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape z"electra/embeddings/word_embeddingsz&electra/embeddings/position_embeddingsz(electra/embeddings/token_type_embeddingsz"electra/embeddings/LayerNorm/gammaz!electra/embeddings/LayerNorm/betaz!electra/embeddings_project/kernelzelectra/embeddings_project/bias)z!embeddings.word_embeddings.weightz%embeddings.position_embeddings.weightz'embeddings.token_type_embeddings.weightzembeddings.LayerNorm.weightzembeddings.LayerNorm.biaszembeddings_project.weightzembeddings_project.biasr   Zg_densedensezelectra/encoder/layer_z/attention/self/query/kernelzencoder.layer.z.attention.self.query.weightz/attention/self/query/biasz.attention.self.query.biasz/attention/self/key/kernelz.attention.self.key.weightz/attention/self/key/biasz.attention.self.key.biasz/attention/self/value/kernelz.attention.self.value.weightz/attention/self/value/biasz.attention.self.value.biasz./attention/self/conv_attn_key/depthwise_kernelz4.attention.self.key_conv_attn_layer.depthwise.weightz./attention/self/conv_attn_key/pointwise_kernelz4.attention.self.key_conv_attn_layer.pointwise.weightz"/attention/self/conv_attn_key/biasz(.attention.self.key_conv_attn_layer.biasz'/attention/self/conv_attn_kernel/kernelz(.attention.self.conv_kernel_layer.weightz%/attention/self/conv_attn_kernel/biasz&.attention.self.conv_kernel_layer.biasz&/attention/self/conv_attn_point/kernelz%.attention.self.conv_out_layer.weightz$/attention/self/conv_attn_point/biasz#.attention.self.conv_out_layer.biasz/attention/output/dense/kernelz.attention.output.dense.weightz!/attention/output/LayerNorm/gammaz".attention.output.LayerNorm.weightz/attention/output/dense/biasz.attention.output.dense.biasz /attention/output/LayerNorm/betaz .attention.output.LayerNorm.biasz/intermediate/z/kernelz.intermediate.dense.weightz/biasz.intermediate.dense.biasz/output/z.output.dense.weightz.output.dense.biasz/output/LayerNorm/gammaz.output.LayerNorm.weightz/output/LayerNorm/betaz.output.LayerNorm.biaszTF: z, PT:  z/intermediate/g_dense/kernelz/output/g_dense/kernelz/depthwise_kernel   z/pointwise_kernelz/conv_attn_key/bias)Z
tensorflowImportErrorloggererrorospathabspathinfotrainZlist_variablesZload_variable
num_groupsrangenum_hidden_layersZnamed_parametersr   torchZ
from_numpyendswithTpermute	unsqueezedata)modelconfigZtf_checkpoint_pathtfZtf_pathZ	init_varsZtf_datanameshapearrayZparam_mappingZgroup_dense_namejparam
param_nameZ	retrieverresultZtf_namevalue r>   m/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/convbert/modeling_convbert.pyload_tf_weights_in_convbert8   s   
	










































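# Conversion sketch (illustrative; the paths below are hypothetical placeholders, not
# files shipped with the library). Builds a randomly initialized PyTorch ConvBERT and
# fills it from an original TensorFlow checkpoint using the loader above:
#
#     from transformers import ConvBertConfig, ConvBertModel
#
#     config = ConvBertConfig.from_json_file("path/to/convbert_config.json")  # hypothetical path
#     model = ConvBertModel(config)
#     load_tf_weights_in_convbert(model, config, "path/to/tf_checkpoint")  # hypothetical path
#     model.save_pretrained("path/to/pytorch_dump")  # hypothetical path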
class ConvBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        # self.LayerNorm is not snake-cased to stick with the TensorFlow variable name and allow TF checkpoint loading
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Default to the registered all-zeros buffer when token_type_ids are not passed
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class ConvBertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ConvBertConfig
    load_tf_weights = load_tf_weights_in_convbert
    base_model_prefix = "convbert"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, ConvBertEncoder):
            module.gradient_checkpointing = value
jjd|jd | jj
jjd|jd d S )Nr    F)kernel_sizegroupspaddingry   r   )r   ry   rr   rs   )rH   rI   r   ZConv1d	depthwise	pointwise	Parameterr-   rV   ry   rv   r2   rw   rx   )rZ   r4   Zinput_filtersZoutput_filtersr   kwargsr[   r>   r?   rI     s    
zSeparableConv1D.__init__hidden_statesr_   c                 C   s"   |  |}| |}|| j7 }|S r|   )r   r   ry   )rZ   r   xr>   r>   r?   rh   #  s    


zSeparableConv1D.forward	ri   rj   rk   rl   rI   r-   Tensorrh   ro   r>   r>   r[   r?   r     s   r   c                
class ConvBertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        new_num_attention_heads = config.num_attention_heads // config.head_ratio
        if new_num_attention_heads < 1:
            self.head_ratio = config.num_attention_heads
            self.num_attention_heads = 1
        else:
            self.num_attention_heads = new_num_attention_heads
            self.head_ratio = config.head_ratio

        self.conv_kernel_size = config.conv_kernel_size
        if config.hidden_size % self.num_attention_heads != 0:
            raise ValueError("hidden_size should be divisible by num_attention_heads")

        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.key_conv_attn_layer = SeparableConv1D(
            config, config.hidden_size, self.all_head_size, self.conv_kernel_size
        )
        self.conv_kernel_layer = nn.Linear(self.all_head_size, self.num_attention_heads * self.conv_kernel_size)
        self.conv_out_layer = nn.Linear(config.hidden_size, self.all_head_size)

        self.unfold = nn.Unfold(
            kernel_size=[self.conv_kernel_size, 1], padding=[int((self.conv_kernel_size - 1) / 2), 0]
        )

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
        mixed_query_layer = self.query(hidden_states)
        batch_size = hidden_states.size(0)
        # If this is used as a cross-attention module, keys and values come from an encoder
        if encoder_hidden_states is not None:
            mixed_key_layer = self.key(encoder_hidden_states)
            mixed_value_layer = self.value(encoder_hidden_states)
        else:
            mixed_key_layer = self.key(hidden_states)
            mixed_value_layer = self.value(hidden_states)

        mixed_key_conv_attn_layer = self.key_conv_attn_layer(hidden_states.transpose(1, 2))
        mixed_key_conv_attn_layer = mixed_key_conv_attn_layer.transpose(1, 2)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)
        conv_attn_layer = torch.multiply(mixed_key_conv_attn_layer, mixed_query_layer)

        # Per-token dynamic convolution kernels, softmax-normalized over the kernel axis
        conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer)
        conv_kernel_layer = torch.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1])
        conv_kernel_layer = torch.softmax(conv_kernel_layer, dim=1)

        conv_out_layer = self.conv_out_layer(hidden_states)
        conv_out_layer = torch.reshape(conv_out_layer, [batch_size, -1, self.all_head_size])
        conv_out_layer = conv_out_layer.transpose(1, 2).contiguous().unsqueeze(-1)
        conv_out_layer = nn.functional.unfold(
            conv_out_layer,
            kernel_size=[self.conv_kernel_size, 1],
            dilation=1,
            padding=[(self.conv_kernel_size - 1) // 2, 0],
            stride=1,
        )
        conv_out_layer = conv_out_layer.transpose(1, 2).reshape(
            batch_size, -1, self.all_head_size, self.conv_kernel_size
        )
        conv_out_layer = torch.reshape(conv_out_layer, [-1, self.attention_head_size, self.conv_kernel_size])
        conv_out_layer = torch.matmul(conv_out_layer, conv_kernel_layer)
        conv_out_layer = torch.reshape(conv_out_layer, [-1, self.all_head_size])

        # Take the dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The attention mask is precomputed for all layers in ConvBertModel.forward()
            attention_scores = attention_scores + attention_mask

        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()

        conv_out = torch.reshape(conv_out_layer, [batch_size, -1, self.num_attention_heads, self.attention_head_size])
        context_layer = torch.cat([context_layer, conv_out], 2)

        new_context_layer_shape = context_layer.size()[:-2] + (self.head_ratio * self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs
d S NrC   )rH   rI   r   ru   r   r   rP   rQ   rR   rS   rT   rY   r[   r>   r?   rI     s    
zConvBertSelfOutput.__init__r   input_tensorr_   c                 C   s&   |  |}| |}| || }|S r|   r   rT   rP   rZ   r   r   r>   r>   r?   rh     s    

zConvBertSelfOutput.forwardri   rj   rk   rI   r-   r   rh   ro   r>   r>   r[   r?   r     s   r   c                
class ConvBertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = ConvBertSelfAttention(config)
        self.output = ConvBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.FloatTensor]]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs
class GroupedLinearLayer(nn.Module):
    def __init__(self, input_size, output_size, num_groups):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.num_groups = num_groups
        self.group_in_dim = self.input_size // self.num_groups
        self.group_out_dim = self.output_size // self.num_groups
        self.weight = nn.Parameter(torch.empty(self.num_groups, self.group_in_dim, self.group_out_dim))
        self.bias = nn.Parameter(torch.empty(output_size))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size = list(hidden_states.size())[0]
        x = torch.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim])
        x = x.permute(1, 0, 2)
        x = torch.matmul(x, self.weight)
        x = x.permute(1, 0, 2)
        x = torch.reshape(x, [batch_size, -1, self.output_size])
        x = x + self.bias
        return x


class ConvBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.num_groups == 1:
            self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        else:
            self.dense = GroupedLinearLayer(
                input_size=config.hidden_size, output_size=config.intermediate_size, num_groups=config.num_groups
            )
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
d| _	t|j| _d S )Nr   r   rC   )rH   rI   r*   r   ru   r   r   r   r   rP   rQ   rR   rS   rT   rY   r[   r>   r?   rI     s    

  zConvBertOutput.__init__r   c                 C   s&   |  |}| |}| || }|S r|   r   r   r>   r>   r?   rh     s    

zConvBertOutput.forwardr   r>   r>   r[   r?   r     s   r   c                       sn   e Zd Z fddZd
ejeej eej eej eej ee e	ejeej f dddZ
dd	 Z  ZS )ConvBertLayerc                    sn   t    |j| _d| _t|| _|j| _|j| _| jrV| jsLt|  dt|| _	t
|| _t|| _d S )Nr   z> should be used as a decoder model if cross attention is added)rH   rI   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr   intermediater   r   rY   r[   r>   r?   rI   #  s    



zConvBertLayer.__init__NF)r   r   r   r   encoder_attention_maskr   r_   c                 C   s   | j ||||d}|d }|dd  }	| jrx|d k	rxt| dsNtd|  d| |||||}
|
d }|	|
dd   }	t| j| j| j|}|f|	 }	|	S )Nr   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r   r   rb   AttributeErrorr   r   feed_forward_chunkr   r   )rZ   r   r   r   r   r   r   Zself_attention_outputsr   r   Zcross_attention_outputslayer_outputr>   r>   r?   rh   1  s<    	

   
zConvBertLayer.forwardc                 C   s   |  |}| ||}|S r|   )r   r   )rZ   r   Zintermediate_outputr   r>   r>   r?   r   Y  s    
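# Note (illustrative): `apply_chunking_to_forward` above is a memory optimization. With
# `config.chunk_size_feed_forward > 0`, the feed-forward pass runs on slices along the
# sequence axis (seq_len_dim == 1) and concatenates the results, equivalent to:
#
#     chunks = attention_output.split(chunk_size_feed_forward, dim=1)
#     layer_output = torch.cat([self.feed_forward_chunk(c) for c in chunks], dim=1)
#
# With chunk_size_feed_forward == 0 (the default) the whole sequence is processed at once.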
class ConvBertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ConvBertLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class ConvBertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


CONVBERT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CONVBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:


            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:


            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Z fddZdd Zdd Zdd Zee	d	e
eeed
deej eej eej eej eej eej ee ee ee eeef d
ddZ  ZS )ConvBertModelc                    sP   t  | t|| _|j|jkr4t|j|j| _t	|| _
|| _|   d S r|   )rH   rI   rA   rg   rL   r   r   ru   embeddings_projectr}   encoderr4   	post_initrY   r[   r>   r?   rI      s    

zConvBertModel.__init__c                 C   s   | j jS r|   rg   rM   rZ   r>   r>   r?   get_input_embeddings  s    z"ConvBertModel.get_input_embeddingsc                 C   s   || j _d S r|   r   )rZ   r=   r>   r>   r?   set_input_embeddings  s    z"ConvBertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        hidden_states = self.embeddings(
            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        if hasattr(self, "embeddings_project"):
            hidden_states = self.embeddings_project(hidden_states)

        hidden_states = self.encoder(
            hidden_states,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return hidden_states


class ConvBertGeneratorPredictions(nn.Module):
    """Prediction module for the generator, made up of two dense layers."""

    def __init__(self, config):
        super().__init__()
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)

    def forward(self, generator_hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        hidden_states = self.dense(generator_hidden_states)
        hidden_states = get_activation("gelu")(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
class ConvBertForMaskedLM(ConvBertPreTrainedModel):
    _tied_weights_keys = ["generator.lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.convbert = ConvBertModel(config)
        self.generator_predictions = ConvBertGeneratorPredictions(config)

        self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.generator_lm_head

    def set_output_embeddings(self, word_embeddings):
        self.generator_lm_head = word_embeddings

    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        generator_hidden_states = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        generator_sequence_output = generator_hidden_states[0]

        prediction_scores = self.generator_predictions(generator_sequence_output)
        prediction_scores = self.generator_lm_head(prediction_scores)

        loss = None
        # Masked language modeling softmax layer
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()  # -100 index = padding token
            loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + generator_hidden_states[1:]
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=generator_hidden_states.hidden_states,
            attentions=generator_hidden_states.attentions,
        )
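# Usage sketch (illustrative): masked-token prediction with the public checkpoint. If the
# checkpoint lacks the generator LM head, those weights are freshly initialized and the
# prediction is only meaningful after fine-tuning:
#
#     import torch
#     from transformers import AutoTokenizer, ConvBertForMaskedLM
#
#     tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
#     model = ConvBertForMaskedLM.from_pretrained("YituTech/conv-bert-base")
#     inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero()[0, 1]
#     print(tokenizer.decode(logits[0, mask_pos].argmax()))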
class ConvBertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

        self.config = config

    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
        x = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = ACT2FN[self.config.hidden_act](x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


@add_start_docstrings(
    """
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    CONVBERT_START_DOCSTRING,
)
class ConvBertForSequenceClassification(ConvBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.convbert = ConvBertModel(config)
        self.classifier = ConvBertClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    CONVBERT_START_DOCSTRING,
)
class ConvBertForMultipleChoice(ConvBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.convbert = ConvBertModel(config)
        self.sequence_summary = SequenceSummary(config)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(
        CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        pooled_output = self.sequence_summary(sequence_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    CONVBERT_START_DOCSTRING,
)
class ConvBertForTokenClassification(ConvBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.convbert = ConvBertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    CONVBERT_START_DOCSTRING,
)
class ConvBertForQuestionAnswering(ConvBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.convbert = ConvBertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convbert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
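# Usage sketch (illustrative): span extraction with the public checkpoint. Note the QA
# head (`qa_outputs`) is freshly initialized unless the checkpoint was fine-tuned for
# question answering, so the extracted span is arbitrary before fine-tuning:
#
#     import torch
#     from transformers import AutoTokenizer, ConvBertForQuestionAnswering
#
#     tokenizer = AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
#     model = ConvBertForQuestionAnswering.from_pretrained("YituTech/conv-bert-base")
#     inputs = tokenizer("Who made ConvBERT?", "ConvBERT was proposed by YituTech.", return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     start = int(outputs.start_logits.argmax())
#     end = int(outputs.end_logits.argmax())
#     print(tokenizer.decode(inputs.input_ids[0, start : end + 1]))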