""" PyTorch CvT model."""


import collections.abc
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import logging
from .configuration_cvt import CvtConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "CvtConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
_EXPECTED_OUTPUT_SHAPE = [1, 384, 14, 14]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/cvt-13"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/cvt-13",
    "microsoft/cvt-13-384",
    "microsoft/cvt-13-384-22k",
    "microsoft/cvt-21",
    "microsoft/cvt-21-384",
    "microsoft/cvt-21-384-22k",
]


@dataclass
class BaseModelOutputWithCLSToken(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
    """

    last_hidden_state: torch.FloatTensor = None
    cls_token_value: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # one Bernoulli draw per sample; broadcast over all remaining dimensions
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize to 0 or 1
    output = input.div(keep_prob) * random_tensor
    return output


class CvtDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)
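# Illustrative sanity check (not part of the original file): with a batch of
# ones, each sample's residual branch is either zeroed out or rescaled by
# 1 / keep_prob, so the expected activation is unchanged in training:
#
#     x = torch.ones(8, 4, 16)
#     out = drop_path(x, drop_prob=0.5, training=True)
#     # each of the 8 samples in `out` is now either all 0.0 or all 2.0 (= 1 / 0.5)
#     out = drop_path(x, drop_prob=0.5, training=False)
#     # at inference time the input is returned untouched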
class CvtEmbeddings(nn.Module):
    """
    Construct the CvT embeddings.
    """

    def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate):
        super().__init__()
        self.convolution_embeddings = CvtConvEmbeddings(
            patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, pixel_values):
        hidden_state = self.convolution_embeddings(pixel_values)
        hidden_state = self.dropout(hidden_state)
        return hidden_state


class CvtConvEmbeddings(nn.Module):
    """
    Image to Conv Embedding.
    """

    def __init__(self, patch_size, num_channels, embed_dim, stride, padding):
        super().__init__()
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.patch_size = patch_size
        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
        self.normalization = nn.LayerNorm(embed_dim)

    def forward(self, pixel_values):
        pixel_values = self.projection(pixel_values)
        batch_size, num_channels, height, width = pixel_values.shape
        hidden_size = height * width
        # rearrange "b c h w -> b (h w) c" so LayerNorm runs over the channel dimension
        pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
        if self.normalization:
            pixel_values = self.normalization(pixel_values)
        # rearrange "b (h w) c -> b c h w" to restore the 2D feature map
        pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width)
        return pixel_values
class CvtSelfAttentionConvProjection(nn.Module):
    def __init__(self, embed_dim, kernel_size, padding, stride):
        super().__init__()
        self.convolution = nn.Conv2d(
            embed_dim,
            embed_dim,
            kernel_size=kernel_size,
            padding=padding,
            stride=stride,
            bias=False,
            groups=embed_dim,
        )
        self.normalization = nn.BatchNorm2d(embed_dim)

    def forward(self, hidden_state):
        hidden_state = self.convolution(hidden_state)
        hidden_state = self.normalization(hidden_state)
        return hidden_state


class CvtSelfAttentionLinearProjection(nn.Module):
    def forward(self, hidden_state):
        batch_size, num_channels, height, width = hidden_state.shape
        hidden_size = height * width
        # rearrange "b c h w -> b (h w) c"
        hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
        return hidden_state


class CvtSelfAttentionProjection(nn.Module):
    def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"):
        super().__init__()
        if projection_method == "dw_bn":
            self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride)
        self.linear_projection = CvtSelfAttentionLinearProjection()

    def forward(self, hidden_state):
        hidden_state = self.convolution_projection(hidden_state)
        hidden_state = self.linear_projection(hidden_state)
        return hidden_state


class CvtSelfAttention(nn.Module):
    def __init__(
        self,
        num_heads,
        embed_dim,
        kernel_size,
        padding_q,
        padding_kv,
        stride_q,
        stride_kv,
        qkv_projection_method,
        qkv_bias,
        attention_drop_rate,
        with_cls_token=True,
        **kwargs,
    ):
        super().__init__()
        self.scale = embed_dim**-0.5
        self.with_cls_token = with_cls_token
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.convolution_projection_query = CvtSelfAttentionProjection(
            embed_dim,
            kernel_size,
            padding_q,
            stride_q,
            projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
        )
        self.convolution_projection_key = CvtSelfAttentionProjection(
            embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
        )
        self.convolution_projection_value = CvtSelfAttentionProjection(
            embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
        )

        self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
        self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
        self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)

        self.dropout = nn.Dropout(attention_drop_rate)

    def rearrange_for_multi_head_attention(self, hidden_state):
        batch_size, hidden_size, _ = hidden_state.shape
        head_dim = self.embed_dim // self.num_heads
        # rearrange "b t (h d) -> b h t d"
        return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3)

    def forward(self, hidden_state, height, width):
        if self.with_cls_token:
            cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
        batch_size, hidden_size, num_channels = hidden_state.shape
        # rearrange "b (h w) c -> b c h w"
        hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)

        key = self.convolution_projection_key(hidden_state)
        query = self.convolution_projection_query(hidden_state)
        value = self.convolution_projection_value(hidden_state)

        if self.with_cls_token:
            query = torch.cat((cls_token, query), dim=1)
            key = torch.cat((cls_token, key), dim=1)
            value = torch.cat((cls_token, value), dim=1)

        head_dim = self.embed_dim // self.num_heads

        query = self.rearrange_for_multi_head_attention(self.projection_query(query))
        key = self.rearrange_for_multi_head_attention(self.projection_key(key))
        value = self.rearrange_for_multi_head_attention(self.projection_value(value))

        attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale
        attention_probs = torch.nn.functional.softmax(attention_score, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value])
        # rearrange "b h t d -> b t (h d)"
        _, _, hidden_size, _ = context.shape
        context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim)
        return context
class CvtSelfOutput(nn.Module):
    """
    The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, embed_dim, drop_rate):
        super().__init__()
        self.dense = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, hidden_state, input_tensor):
        hidden_state = self.dense(hidden_state)
        hidden_state = self.dropout(hidden_state)
        return hidden_state


class CvtAttention(nn.Module):
    def __init__(
        self,
        num_heads,
        embed_dim,
        kernel_size,
        padding_q,
        padding_kv,
        stride_q,
        stride_kv,
        qkv_projection_method,
        qkv_bias,
        attention_drop_rate,
        drop_rate,
        with_cls_token=True,
    ):
        super().__init__()
        self.attention = CvtSelfAttention(
            num_heads,
            embed_dim,
            kernel_size,
            padding_q,
            padding_kv,
            stride_q,
            stride_kv,
            qkv_projection_method,
            qkv_bias,
            attention_drop_rate,
            with_cls_token,
        )
        self.output = CvtSelfOutput(embed_dim, drop_rate)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_state, height, width):
        self_output = self.attention(hidden_state, height, width)
        attention_output = self.output(self_output, hidden_state)
        return attention_output


class CvtIntermediate(nn.Module):
    def __init__(self, embed_dim, mlp_ratio):
        super().__init__()
        self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
        self.activation = nn.GELU()

    def forward(self, hidden_state):
        hidden_state = self.dense(hidden_state)
        hidden_state = self.activation(hidden_state)
        return hidden_state


class CvtOutput(nn.Module):
    def __init__(self, embed_dim, mlp_ratio, drop_rate):
        super().__init__()
        self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, hidden_state, input_tensor):
        hidden_state = self.dense(hidden_state)
        hidden_state = self.dropout(hidden_state)
        hidden_state = hidden_state + input_tensor
        return hidden_state
class CvtLayer(nn.Module):
    """
    CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
    """

    def __init__(
        self,
        num_heads,
        embed_dim,
        kernel_size,
        padding_q,
        padding_kv,
        stride_q,
        stride_kv,
        qkv_projection_method,
        qkv_bias,
        attention_drop_rate,
        drop_rate,
        mlp_ratio,
        drop_path_rate,
        with_cls_token=True,
    ):
        super().__init__()
        self.attention = CvtAttention(
            num_heads,
            embed_dim,
            kernel_size,
            padding_q,
            padding_kv,
            stride_q,
            stride_kv,
            qkv_projection_method,
            qkv_bias,
            attention_drop_rate,
            drop_rate,
            with_cls_token,
        )

        self.intermediate = CvtIntermediate(embed_dim, mlp_ratio)
        self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate)
        self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_before = nn.LayerNorm(embed_dim)
        self.layernorm_after = nn.LayerNorm(embed_dim)

    def forward(self, hidden_state, height, width):
        # in CvT, layernorm is applied before self-attention
        self_attention_output = self.attention(self.layernorm_before(hidden_state), height, width)
        attention_output = self_attention_output
        attention_output = self.drop_path(attention_output)

        # first residual connection
        hidden_state = attention_output + hidden_state

        # layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_state)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done inside CvtOutput
        layer_output = self.output(layer_output, hidden_state)
        layer_output = self.drop_path(layer_output)
        return layer_output
class CvtStage(nn.Module):
    def __init__(self, config, stage):
        super().__init__()
        self.config = config
        self.stage = stage
        if self.config.cls_token[self.stage]:
            self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1]))

        self.embedding = CvtEmbeddings(
            patch_size=config.patch_sizes[self.stage],
            stride=config.patch_stride[self.stage],
            num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
            embed_dim=config.embed_dim[self.stage],
            padding=config.patch_padding[self.stage],
            dropout_rate=config.drop_rate[self.stage],
        )

        drop_path_rates = [x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage])]

        self.layers = nn.Sequential(
            *[
                CvtLayer(
                    num_heads=config.num_heads[self.stage],
                    embed_dim=config.embed_dim[self.stage],
                    kernel_size=config.kernel_qkv[self.stage],
                    padding_q=config.padding_q[self.stage],
                    padding_kv=config.padding_kv[self.stage],
                    stride_kv=config.stride_kv[self.stage],
                    stride_q=config.stride_q[self.stage],
                    qkv_projection_method=config.qkv_projection_method[self.stage],
                    qkv_bias=config.qkv_bias[self.stage],
                    attention_drop_rate=config.attention_drop_rate[self.stage],
                    drop_rate=config.drop_rate[self.stage],
                    drop_path_rate=drop_path_rates[self.stage],
                    mlp_ratio=config.mlp_ratio[self.stage],
                    with_cls_token=config.cls_token[self.stage],
                )
                for _ in range(config.depth[self.stage])
            ]
        )

    def forward(self, hidden_state):
        cls_token = None
        hidden_state = self.embedding(hidden_state)
        batch_size, num_channels, height, width = hidden_state.shape
        # rearrange "b c h w -> b (h w) c"
        hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1)
        if self.config.cls_token[self.stage]:
            cls_token = self.cls_token.expand(batch_size, -1, -1)
            hidden_state = torch.cat((cls_token, hidden_state), dim=1)

        for layer in self.layers:
            layer_outputs = layer(hidden_state, height, width)
            hidden_state = layer_outputs

        if self.config.cls_token[self.stage]:
            cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
        hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
        return hidden_state, cls_token
class CvtEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.stages = nn.ModuleList([])
        for stage_idx in range(len(config.depth)):
            self.stages.append(CvtStage(config, stage_idx))

    def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
        all_hidden_states = () if output_hidden_states else None
        hidden_state = pixel_values

        cls_token = None
        for stage_module in self.stages:
            hidden_state, cls_token = stage_module(hidden_state)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_state,)

        if not return_dict:
            return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)

        return BaseModelOutputWithCLSToken(
            last_hidden_state=hidden_state,
            cls_token_value=cls_token,
            hidden_states=all_hidden_states,
        )
class CvtPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CvtConfig
    base_model_prefix = "cvt"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data, mean=0.0, std=self.config.initializer_range
            )
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, CvtStage):
            if self.config.cls_token[module.stage]:
                module.cls_token.data = nn.init.trunc_normal_(
                    torch.zeros(1, 1, self.config.embed_dim[-1]), mean=0.0, std=self.config.initializer_range
                )
CVT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CVT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
    CVT_START_DOCSTRING,
)
class CvtModel(CvtPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.encoder = CvtEncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithCLSToken,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithCLSToken]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        encoder_outputs = self.encoder(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutputWithCLSToken(
            last_hidden_state=sequence_output,
            cls_token_value=encoder_outputs.cls_token_value,
            hidden_states=encoder_outputs.hidden_states,
        )
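# Minimal usage sketch for the bare model (illustrative, not part of the
# original file; mirrors the doc example referenced by _CHECKPOINT_FOR_DOC):
#
#     from transformers import AutoImageProcessor, CvtModel
#     from PIL import Image
#     import requests
#
#     url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#     image = Image.open(requests.get(url, stream=True).raw)
#     image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
#     model = CvtModel.from_pretrained("microsoft/cvt-13")
#     inputs = image_processor(image, return_tensors="pt")
#     outputs = model(**inputs)
#     list(outputs.last_hidden_state.shape)  # [1, 384, 14, 14]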
@add_start_docstrings(
    """
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """,
    CVT_START_DOCSTRING,
)
class CvtForImageClassification(CvtPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.cvt = CvtModel(config, add_pooling_layer=False)
        self.layernorm = nn.LayerNorm(config.embed_dim[-1])
        # Classifier head
        self.classifier = (
            nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.cvt(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        cls_token = outputs[1]
        if self.config.cls_token[-1]:
            sequence_output = self.layernorm(cls_token)
        else:
            batch_size, num_channels, height, width = sequence_output.shape
            # rearrange "b c h w -> b (h w) c"
            sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
            sequence_output = self.layernorm(sequence_output)

        sequence_output_mean = sequence_output.mean(dim=1)
        logits = self.classifier(sequence_output_mean)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)