""" PyTorch ViTDet backbone."""

import collections.abc
import math
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_vitdet import VitDetConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "VitDetConfig"

VITDET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/vit-det-base",
]


class VitDetEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.pretrain_image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        if config.use_absolute_position_embeddings:
            # initialize absolute positional embedding with the pretrain image size (plus one slot for the cls token)
            num_positions = num_patches + 1
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size))
        else:
            self.position_embeddings = None

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width):
        """
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        """
        if has_cls_token:
            abs_pos_embeddings = abs_pos_embeddings[:, 1:]
        num_position = abs_pos_embeddings.shape[1]
        size = int(math.sqrt(num_position))
        if size * size != num_position:
            raise ValueError("Absolute position embeddings must be a square number.")

        if size != height or size != width:
            new_abs_pos_embeddings = nn.functional.interpolate(
                abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )

            return new_abs_pos_embeddings.permute(0, 2, 3, 1)
        else:
            return abs_pos_embeddings.reshape(1, height, width, -1)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values)

        if self.position_embeddings is not None:
            # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
            embeddings = embeddings.permute(0, 2, 3, 1)
            # add position embeddings
            embeddings = embeddings + self.get_absolute_positions(
                self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2]
            )
            # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
            embeddings = embeddings.permute(0, 3, 1, 2)

        return embeddings
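

# Illustrative sketch: the shape contract of `VitDetEmbeddings`, assuming the
# default `VitDetConfig` values (224x224 pretraining resolution, 16x16 patches,
# hidden size 768). `_demo_embeddings` is a hypothetical helper added for
# illustration only and is meant to be run manually, never on import.
def _demo_embeddings():
    config = VitDetConfig()
    embeddings = VitDetEmbeddings(config)
    pixel_values = torch.randn(1, config.num_channels, 224, 224)
    out = embeddings(pixel_values)
    # 224 / 16 = 14 patches per side; the output stays channels-first
    assert out.shape == (1, config.hidden_size, 14, 14)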


def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        # Interpolate rel position embeddings.
        rel_pos_resized = nn.functional.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]
dddddddddf  |dddddddddf  ||| ||	 } | S )a  
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`Tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`Tuple[int]`):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)rL   r0   r7   r!   Zeinsumview)Zattnqueries	rel_pos_h	rel_pos_wrJ   rK   Zqueries_heightZqueries_widthZkeys_heightZ
keys_widthZrelative_heightZrelative_width
batch_size_dimZr_qZrelative_weightr*   r*   r+   !add_decomposed_relative_positions   s&        rT   c                       s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
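

# Illustrative sketch: the decomposed scheme adds one height term and one width
# term to the attention map instead of materializing a full 2D relative-position
# table. Shapes assume a 7x7 token grid and 64-dim heads; the `_demo_...` helper
# is hypothetical and meant for manual runs.
def _demo_add_decomposed_relative_positions():
    batch_heads, height, width, dim = 2, 7, 7, 64
    queries = torch.randn(batch_heads, height * width, dim)
    attn = torch.randn(batch_heads, height * width, height * width)
    rel_pos = torch.randn(2 * 7 - 1, dim)  # one shared table for both axes here
    out = add_decomposed_relative_positions(
        attn, queries, rel_pos, rel_pos, (height, width), (height, width)
    )
    assert out.shape == attn.shape  # the bias is added directly onto the attention map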


class VitDetAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config, input_size=None):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`Tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        """
        super().__init__()

        dim = config.hidden_size
        num_heads = config.num_attention_heads

        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, hidden_state, output_attentions=False):
        batch_size, height, width, _ = hidden_state.shape
        # qkv with shape (3, batch_size, num_heads, height * width, num_channels)
        qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # queries, keys and values have shape (batch_size * num_heads, height * width, num_channels)
        queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0)

        attention_scores = (queries * self.scale) @ keys.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            attention_scores = add_decomposed_relative_positions(
                attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attention_probs = attention_scores.softmax(dim=-1)

        hidden_state = attention_probs @ values
        hidden_state = hidden_state.view(batch_size, self.num_heads, height, width, -1)
        hidden_state = hidden_state.permute(0, 2, 3, 1, 4)
        hidden_state = hidden_state.reshape(batch_size, height, width, -1)
        hidden_state = self.proj(hidden_state)

        if output_attentions:
            attention_probs = attention_probs.reshape(
                batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1]
            )
            outputs = (hidden_state, attention_probs)
        else:
            outputs = (hidden_state,)

        return outputs
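

# Illustrative sketch: `VitDetAttention` consumes and produces channels-last
# blocks of shape (batch_size, height, width, hidden_size). Assumes the default
# `VitDetConfig` (hidden size 768); `_demo_attention` is a hypothetical helper
# for manual runs.
def _demo_attention():
    config = VitDetConfig()
    attention = VitDetAttention(config, input_size=(14, 14))
    hidden_state = torch.randn(1, 14, 14, config.hidden_size)
    output = attention(hidden_state)[0]  # forward returns a tuple
    assert output.shape == hidden_state.shape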
zVitDetAttention.forward)N)FrA   rB   rC   rD   r   r@   rF   r*   r*   r(   r+   rU      s   rU           F)input	drop_probtrainingr=   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class VitDetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class VitDetLayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class VitDetResBottleneckBlock(nn.Module):
    """
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    """

    def __init__(self, config, in_channels, out_channels, bottleneck_channels):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = VitDetLayerNorm(bottleneck_channels)
        self.act1 = ACT2FN[config.hidden_act]

        self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False)
        self.norm2 = VitDetLayerNorm(bottleneck_channels)
        self.act2 = ACT2FN[config.hidden_act]

        self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = VitDetLayerNorm(out_channels)

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class VitDetMlp(nn.Module):
    def __init__(self, config, in_features: int, hidden_features: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x


def window_partition(hidden_state, window_size):
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (patch_height, patch_width): padded height and width before partition
    r   r   r   r-   r_      r,   )r0   r   r5   padrM   r8   
contiguous)ra   window_sizerQ   r9   r:   r   Z
pad_heightZ	pad_widthpatch_heightpatch_widthwindowsr*   r*   r+   window_partition  s          $r   c           
      C   s   |\}}|\}}| j d || | |  }| ||| || ||d}	|	dddddd |||d}	||ksz||kr|	ddd|d|ddf  }	|	S )	a@  
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`Tuple[int]`):
            Padded height and width (patch_height, patch_width).
        height_width (`Tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    """
    patch_height, patch_width = pad_height_width
    height, width = height_width
    batch_size = windows.shape[0] // (patch_height * patch_width // window_size // window_size)
    hidden_state = windows.view(
        batch_size, patch_height // window_size, patch_width // window_size, window_size, window_size, -1
    )
    hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous()
    hidden_state = hidden_state.view(batch_size, patch_height, patch_width, -1)

    if patch_height > height or patch_width > width:
        hidden_state = hidden_state[:, :height, :width, :].contiguous()
    return hidden_state
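

# Illustrative sketch: `window_partition` and `window_unpartition` round-trip a
# feature map whose side length is not a multiple of the window size. A 14x14
# grid is zero-padded to 16x16, split into 4x4 windows, and restored exactly.
# `_demo_window_roundtrip` is a hypothetical helper for manual runs.
def _demo_window_roundtrip():
    hidden_state = torch.randn(2, 14, 14, 768)  # (batch, height, width, channels)
    windows, pad_height_width = window_partition(hidden_state, window_size=4)
    assert windows.shape == (2 * 16, 4, 4, 768)  # (16 / 4) ** 2 windows per image
    restored = window_unpartition(windows, 4, pad_height_width, (14, 14))
    assert torch.equal(restored, hidden_state)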
ee	j
 eeee	j
e	j
f ee	j
 f dd	d
Z  ZS )VitDetLayerzCThis corresponds to the Block class in the original implementation.r   FN)r'   drop_path_rater   use_residual_blockr=   c                    s   t    |j}|j|j |j|j f}tj||jd| _t	||dkrJ|n||fd| _
|dkrht|nt | _tj||jd| _t||t||j d| _|| _|| _| jrt||||d d| _d S )N)r}   r   )r^   rg   )r'   r   r   r-   )r'   r   r   r   )r   r   r   r   r   r   	LayerNormZlayer_norm_epsr   rU   	attentionrq   ZIdentityrp   r   r   r1   Z	mlp_ratiomlpr   r   r   residual)r&   r'   r   r   r   rS   r^   r(   r*   r+   r     s(    
 zVitDetLayer.__init__)rs   	head_maskrb   r=   c           
      C   s   | dddd}|}| |}| jdkrN|jd |jd  }}t|| j\}}| j||d}|d }|dd  }	| jdkrt|| j|||f}|| | }|| | | 	| }| dddd}| j
r| |}|f|	 }	|	S )Nr   r-   r   r   rb   )r8   r   r   r0   r   r   r   rp   r   r   r   r   )
r&   rs   r   rb   Zshortcutr9   r:   r   Zself_attention_outputsre   r*   r*   r+   r@     s*    




zVitDetLayer.forward)r   r   F)NF)rA   rB   rC   rD   r   rx   r1   boolr   r!   rE   r   r   r   r@   rF   r*   r*   r(   r+   r     s&              r   c                	       sN   e Zd Zedd fddZd
ejeej eeee	e
ef ddd	Z  ZS )VitDetEncoderN)r'   r=   c              	      s   t    || _|j}dd td|j|D }g }t|D ]4}|t	||| ||j
kr^|jnd||jkd q<t|| _d| _d S )Nc                 S   s   g | ]}|  qS r*   )item).0r   r*   r*   r+   
<listcomp>  s     z*VitDetEncoder.__init__.<locals>.<listcomp>r   )r   r   r   F)r   r   r'   num_hidden_layersr!   Zlinspacer   rangeappendr   Zwindow_block_indicesr   Zresidual_block_indicesr   Z
ModuleListr   gradient_checkpointing)r&   r'   depthr   Zlayersir(   r*   r+   r     s     
	zVitDetEncoder.__init__FT)rs   r   rb   output_hidden_statesreturn_dictr=   c                    s   |rdnd } rdnd }t | jD ]\}}	|r8||f }|d k	rH|| nd }
| jr|| jr| fdd}tjj||	||
}n|	||
 }|d } r"||d f }q"|r||f }|stdd |||fD S t|||dS )	Nr*   c                    s    fdd}|S )Nc                     s    | f S rr   r*   )inputs)modulerb   r*   r+   custom_forward:  s    zLVitDetEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr*   )r   r   r   r   r+   create_custom_forward9  s    z4VitDetEncoder.forward.<locals>.create_custom_forwardr   r   c                 s   s   | ]}|d k	r|V  qd S rr   r*   )r   vr*   r*   r+   	<genexpr>P  s      z(VitDetEncoder.forward.<locals>.<genexpr>Zlast_hidden_staters   
attentions)		enumerater   r   rj   r!   utils
checkpointtupler   )r&   rs   r   rb   r   r   Zall_hidden_statesZall_self_attentionsr   Zlayer_moduleZlayer_head_maskr   Zlayer_outputsr*   r   r+   r@   &  s4    

zVitDetEncoder.forward)NFFT)rA   rB   rC   r   r   r!   rE   r   r   r   r   r   r@   rF   r*   r*   r(   r+   r     s       
r   r   r=   c                 C   s2   t jj| jddd | jdk	r.t j| jd dS )a  
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0.

    Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html.

    Args:
        module (torch.nn.Module): module to initialize.
    Zfan_outZrelu)r/   ZnonlinearityNr   )r   initZkaiming_normal_r|   rW   Z	constant_r   r*   r*   r+   caffe2_msra_fillX  s    	
r   c                   @   sX   e Zd ZdZeZdZdZdZg Z	e
ejejejf ddddZdeedd
ddZdS )VitDetPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    Zvitdetr<   TNr   c                 C   s  t |tjtjfr\tjj|jjt	j
d| jjd|jj|j_|jdk	rX|jj  n@t |tjr|jj  |jjd nt |trtjj|jjt	j
d| jjd|jj|j_nt |tr&| jjr&tjj|jjt	j
d| jjd|j_tjj|jjt	j
d| jjd|j_nvt |tr|j|j|jfD ]}t| qB|j|jfD ] }|jjd |jj  q^|jjj  |jjj  dS )zInitialize the weightsrg   )r   ZstdNrG   ) r   r   rZ   r$   r   Ztrunc_normal_r|   datator!   Zfloat32r'   Zinitializer_rangerk   rW   Zzero_r   Zfill_r   r#   rU   r]   rO   rP   r   r   r   r   r   r   r   r   )r&   r   r   r*   r*   r+   _init_weightsr  sP      



z#VitDetPreTrainedModel._init_weightsF)r   valuer=   c                 C   s   t |tr||_d S rr   )r   r   r   )r&   r   r   r*   r*   r+   _set_gradient_checkpointing  s    
z1VitDetPreTrainedModel._set_gradient_checkpointing)F)rA   rB   rC   rD   r   config_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesr   r   rZ   r$   r   r   r   r   r   r*   r*   r*   r+   r   f  s    +r   aH  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VitDetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITDET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare VitDet Transformer model outputting raw hidden-states without any specific head on top.",
    VITDET_START_DOCSTRING,
)
class VitDetModel(VitDetPreTrainedModel):
    def __init__(self, config: VitDetConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   Zprune_heads)r&   r   r   Zheadsr*   r*   r+   _prune_heads  s    zVitDetModel._prune_headsoutput_typer   )r<   r   rb   r   r   r=   c           	      C   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}|dkrLtd| || j j}| |}| j|||||d}|d }|s|f|dd  S t	||j
|jdS )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    """
    ViTDet backbone, to be used with frameworks like Mask R-CNN.
    """,
    VITDET_START_DOCSTRING,
)
class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)
        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )