# NOTE: what follows is a readable reconstruction of a CPython 3.8 bytecode cache (.pyc) of
# transformers/models/dinov2/modeling_dinov2.py (the embedded source path ends in
# .../python3.8/site-packages/transformers/models/dinov2/modeling_dinov2.py). Only string constants,
# names and docstrings survive in the compiled file, so docstrings are quoted as recovered, method
# bodies are summarized in comments, and any code written out in full is a best-effort sketch inferred
# from those recoverable names rather than verbatim original source.
""" PyTorch DINOv2 model."""

import collections.abc
import math
from typing import Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_dinov2 import Dinov2Config


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "Dinov2Config"
_CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
# The values of the remaining doc constants (_EXPECTED_OUTPUT_SHAPE, _IMAGE_CLASS_CHECKPOINT) are stored
# as binary constants and are not recoverable from the compiled file.

DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/dinov2-base"]


class Dinov2Embeddings(nn.Module):
    """
    Construct the CLS token, mask token, position and patch embeddings.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        # Recovered attributes: cls_token (nn.Parameter, randn of shape (1, 1, hidden_size)),
        # mask_token (nn.Parameter, zeros of shape (1, hidden_size)), patch_embeddings
        # (Dinov2PatchEmbeddings), position_embeddings (nn.Parameter over num_patches + 1 positions),
        # dropout (nn.Dropout(config.hidden_dropout_prob)) and config.
        ...

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """
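        # Hedged reconstruction of the compiled body, pieced together from the names and string constants
        # that survive in the bytecode (patch_size, reshape/permute, bicubic interpolation, the ValueError
        # message below); a sketch of the technique, not verbatim recovered source.
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1
        if num_patches == num_positions and height == width:
            return self.position_embeddings
        class_pos_embed = self.position_embeddings[:, 0]
        patch_pos_embed = self.position_embeddings[:, 1:]
        dim = embeddings.shape[-1]
        # Resample the stored (sqrt(N) x sqrt(N)) grid of patch position embeddings to the new grid size.
        new_height = height // self.config.patch_size
        new_width = width // self.config.patch_size
        grid_size = int(math.sqrt(num_positions))
        patch_pos_embed = patch_pos_embed.reshape(1, grid_size, grid_size, dim).permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=((new_height + 0.1) / grid_size, (new_width + 0.1) / grid_size),
            mode="bicubic",
            align_corners=False,
        )
        if new_height != patch_pos_embed.shape[-2] or new_width != patch_pos_embed.shape[-1]:
            raise ValueError("Width or height does not match with the interpolated position embeddings")
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)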
    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Recovered flow: patch-embed the pixel values, replace the patch tokens selected by
        # bool_masked_pos with the learned mask token (torch.where), prepend the expanded CLS token,
        # add the (interpolated) position embeddings for the input height/width and apply dropout.
        ...


class Dinov2PatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """
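    # The two methods below are a hedged reconstruction from the names and the channel-mismatch error
    # string recovered from the bytecode (Conv2d/kernel_size/stride, flatten, transpose); they sketch the
    # standard ViT-style patchify step rather than reproduce the exact original lines.
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        # Every non-overlapping patch is projected to the hidden size with a strided convolution.
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        # (batch, C, H, W) -> (batch, hidden_size, H/ps, W/ps) -> (batch, num_patches, hidden_size)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings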
class Dinov2SelfAttention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        # Recovered structure: hidden_size must be divisible by num_attention_heads (otherwise
        # "The hidden size ... is not a multiple of the number of attention heads ..." is raised);
        # query/key/value are nn.Linear(hidden_size, all_head_size, bias=config.qkv_bias) and a
        # nn.Dropout(config.attention_probs_dropout_prob) is applied to the attention probabilities.
        ...

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        # Reshapes (batch, seq, all_head_size) into (batch, num_heads, seq, head_size).
        ...

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        # Standard scaled dot-product attention: scores = Q @ K^T / sqrt(head_size), softmax, dropout,
        # optional head mask, context = probs @ V, heads merged back to (batch, seq, all_head_size);
        # returns (context, probs) when output_attentions is True.
        ...


class Dinov2SelfOutput(nn.Module):
    """
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """
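    # Hedged reconstruction from the recovered names (dense/Linear, dropout); a sketch, not verbatim source.
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # input_tensor is accepted for interface compatibility; the residual add itself lives in Dinov2Layer.
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states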
class Dinov2Attention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        # Wraps Dinov2SelfAttention (`attention`) and Dinov2SelfOutput (`output`) and tracks pruned_heads.
        ...

    def prune_heads(self, heads: Set[int]) -> None:
        # Uses find_pruneable_heads_and_indices / prune_linear_layer on the query, key, value and output
        # projections, then updates num_attention_heads, all_head_size and pruned_heads.
        ...

    def forward(self, hidden_states, head_mask=None, output_attentions=False):
        ...


class Dinov2LayerScale(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        # lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))
        ...

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        # Per-channel scaling of the residual branch: hidden_state * self.lambda1
        ...


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
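    # Hedged reconstruction of the body from the recoverable names (keep_prob, shape, ndim, rand,
    # dtype/device, floor_, div); a sketch of standard stochastic depth, not verbatim recovered source.
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dimensions of the input.
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output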
class Dinov2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Delegates to drop_path(hidden_states, self.drop_prob, self.training).
        ...

    def extra_repr(self) -> str:
        # "p={drop_prob}"
        ...


class Dinov2MLP(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        # fc1: Linear(hidden_size, int(mlp_ratio * hidden_size)), activation from ACT2FN[config.hidden_act]
        # (or config.hidden_act directly when it is already a callable), fc2: Linear back to hidden_size.
        ...

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        # fc1 -> activation -> fc2
        ...


class Dinov2SwiGLUFFN(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        # hidden_features = int(hidden_size * mlp_ratio), reduced to roughly 2/3 and rounded up to a
        # multiple of 8; weights_in: Linear(hidden_size, 2 * hidden_features),
        # weights_out: Linear(hidden_features, hidden_size).
        ...

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        # SwiGLU gating: split the weights_in output into two halves (chunk), then weights_out(silu(x1) * x2).
        ...


class Dinov2Layer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        # Pre-norm transformer block: norm1 -> Dinov2Attention -> Dinov2LayerScale -> drop_path1, then
        # norm2 -> (Dinov2SwiGLUFFN if config.use_swiglu_ffn else Dinov2MLP) -> layer_scale2 -> drop_path2,
        # each branch added back to its residual input. LayerNorms use config.layer_norm_eps and the
        # drop-path modules are nn.Identity() when config.drop_path_rate is 0.0.
        ...

    def forward(self, hidden_states, head_mask=None, output_attentions=False):
        ...


class Dinov2Encoder(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        # nn.ModuleList of config.num_hidden_layers Dinov2Layer blocks; gradient_checkpointing = False.
        ...

    def forward(
        self,
        hidden_states,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ) -> Union[tuple, BaseModelOutput]:
        # Runs the layers in sequence (through torch.utils.checkpoint with a create_custom_forward wrapper
        # when gradient checkpointing is enabled during training), collects all hidden states / attentions
        # on request and returns a BaseModelOutput, or the equivalent tuple when return_dict is False.
        ...


class Dinov2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Dinov2Config
    base_model_prefix = "dinov2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        # Recovered behaviour: nn.Linear / nn.Conv2d weights get a truncated-normal init (drawn in float32
        # with std=config.initializer_range, then cast back to the original dtype) and zeroed biases;
        # nn.LayerNorm gets zero bias and unit weight; Dinov2Embeddings gets the same truncated-normal
        # init for its position_embeddings and cls_token.
        ...

    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
        # Enables/disables gradient_checkpointing on Dinov2Encoder modules.
        ...


DINOV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

DINOV2_BASE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

DINOV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
    DINOV2_START_DOCSTRING,
)
class Dinov2Model(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config):
        super().__init__(config)
        self.config = config
        # embeddings: Dinov2Embeddings(config), encoder: Dinov2Encoder(config),
        # layernorm: nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps); then self.post_init().
        ...

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # Calls self.encoder.layer[layer].attention.prune_heads(heads) for every entry.
        ...

    @add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        # Recovered flow: raise ValueError("You have to specify pixel_values") when no input is given,
        # resolve the output_*/return_dict flags against the config, run the embeddings (with
        # bool_masked_pos) and the encoder, apply the final layernorm, use the CLS token as pooled output
        # and return a BaseModelOutputWithPooling (or the equivalent tuple when return_dict is False).
        ...


@add_start_docstrings(
    """
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels
        self.dinov2 = Dinov2Model(config)
        # classifier: nn.Linear(2 * config.hidden_size, config.num_labels) (the CLS token concatenated
        # with the mean of the patch tokens), or nn.Identity() when num_labels == 0; then self.post_init().
        ...

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
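        # A hedged sketch of the compiled body, based on the recoverable names and the problem_type
        # strings ("regression", "single_label_classification", "multi_label_classification"); it shows
        # the CLS + mean-patch pooling and the loss selection, not the verbatim original lines.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.dinov2(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]  # (batch, 1 + num_patches, hidden_size)

        cls_token = sequence_output[:, 0]
        patch_tokens = sequence_output[:, 1:]
        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
        logits = self.classifier(linear_input)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.squeeze(), labels.squeeze()) if self.num_labels == 1 else loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            else:
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )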


@add_start_docstrings(
    """
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)
        # num_features = [config.hidden_size] * (config.num_hidden_layers + 1); embeddings, encoder and a
        # final nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps); then self.post_init().
        ...

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```"""
        # Recovered flow: resolve the output flags, run the embeddings and the encoder with
        # output_hidden_states=True, and for every stage listed in self.out_features take the matching
        # hidden state, optionally apply the final layernorm (config.apply_layernorm) and, when
        # config.reshape_hidden_states is set, drop the CLS token and reshape the patch tokens from
        # (batch, num_patches, hidden_size) back to a (batch, hidden_size, height // patch_size,
        # width // patch_size) feature map; the feature maps are returned in a BackboneOutput (or the
        # equivalent tuple when return_dict is False).
        ...