# coding=utf-8
""" PyTorch ViT model."""


import collections.abc
import math
from typing import Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedImageModelingOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_vit import ViTConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ViTConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"


VIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/vit-base-patch16-224",
    # See all ViT models at https://huggingface.co/models?filter=vit
]


class ViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    FN)configuse_mask_tokenreturnc                    s   t    ttdd|j| _|r<ttdd|jnd | _	t
|| _| jj}ttd|d |j| _t|j| _|| _d S )Nr   )super__init__r   	ParametertorchZrandnhidden_size	cls_tokenZzeros
mask_tokenViTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropoutr   )selfr   r   r*   	__class__ c/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/vit/modeling_vit.pyr"   G   s    
 
zViTEmbeddings.__init__)
embeddingsheightwidthr    c                 C   sN  |j d d }| jj d d }||kr4||kr4| jS | jdddf }| jddddf }|j d }|| jj }	|| jj }
|	d |
d  }	}
|dtt|tt||}|dddd}t	j
j||	t| |
t| fdd	d
}t|	|j d krt|
|j d kst|dddddd|}tj|d|fddS )a#  
        This method allows interpolating the pre-trained position encodings so that the model can be used on
        higher-resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1
        if num_patches == num_positions and height == width:
            return self.position_embeddings
        class_pos_embed = self.position_embeddings[:, 0]
        patch_pos_embed = self.position_embeddings[:, 1:]
        dim = embeddings.shape[-1]
        h0 = height // self.config.patch_size
        w0 = width // self.config.patch_size
        # add a small number to avoid a floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        h0, w0 = h0 + 0.1, w0 + 0.1
        patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)),
            mode="bicubic",
            align_corners=False,
        )
        assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings
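

# Illustrative usage sketch (not part of the upstream file): `interpolate_pos_encoding=True` lets a
# checkpoint pre-trained at 224x224 consume larger inputs by bicubic-resampling the position grid
# above. The resize kwargs passed to the processor are an assumption made for this example:
#
#     from transformers import AutoImageProcessor, ViTModel
#
#     processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
#     model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
#     inputs = processor(images=image, size={"height": 384, "width": 384}, return_tensors="pt")
#     outputs = model(**inputs, interpolate_pos_encoding=True)  # 576 patch tokens + CLS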


class ViTPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class ViTSelfAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ViTSelfOutput(nn.Module):
    """
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states
ej	 eeeej	ej	f eej	 f d	d
dZ  ZS )ViTAttentionNrh   c                    s*   t    t|| _t|| _t | _d S r}   )r!   r"   rg   	attentionr|   outputsetpruned_headsrr   r0   r2   r3   r"     s    


zViTAttention.__init__)headsr    c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r;   )lenr   r   rj   rl   r   r   ro   rp   rq   r   r~   rm   union)r/   r   indexr2   r2   r3   prune_heads  s       zViTAttention.prune_headsFrz   rv   rw   r    c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )r/   rz   rv   rw   Zself_outputsattention_outputr{   r2   r2   r3   rQ      s    zViTAttention.forward)NF)rR   rS   rT   r   r"   r   r@   r   r$   rW   r   rV   r   r   rQ   rY   r2   r2   r0   r3   r     s     r   c                       s8   e Zd Zedd fddZejejdddZ  ZS )ViTIntermediateNrh   c                    sB   t    t|j|j| _t|jt	r6t


class ViTIntermediate(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class ViTOutput(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states
j|j|jd| _t
j|j|jd| _d S )Nr   Zeps)r!   r"   Zchunk_size_feed_forwardZseq_len_dimr   r   r   intermediater   r   r   	LayerNormr%   layer_norm_epslayernorm_beforelayernorm_afterrr   r0   r2   r3   r"   P  s    



zViTLayer.__init__Fr   c                 C   s`   | j | |||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S )Nrw   r   r   )r   r   r   r   r   )r/   rz   rv   rw   Zself_attention_outputsr   r{   Zlayer_outputr2   r2   r3   rQ   Z  s    


zViTLayer.forward)NF)rR   rS   rT   rU   r   r"   r$   rW   r   rV   r   r   rQ   rY   r2   r2   r0   r3   r   M  s     r   c                	       sN   e Zd Zedd fddZd
ejeej eeee	e
ef ddd	Z  ZS )
ViTEncoderNrh   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r2   )r   ).0_r   r2   r3   
<listcomp>{  s     z'ViTEncoder.__init__.<locals>.<listcomp>F)	r!   r"   r   r   Z
ModuleListrangenum_hidden_layerslayergradient_checkpointingrr   r0   r   r3   r"   x  s    
 zViTEncoder.__init__FT)rz   rv   rw   output_hidden_statesreturn_dictr    c                    s   |rdnd } rdnd }t | jD ]\}}	|r8||f }|d k	rH|| nd }
| jr|| jr| fdd}tjj||	||
}n|	||
 }|d } r"||d f }q"|r||f }|stdd |||fD S t|||dS )	Nr2   c                    s    fdd}|S )Nc                     s    | f S r}   r2   )inputs)modulerw   r2   r3   custom_forward  s    zIViTEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr2   )r   r   r   )r   r3   create_custom_forward  s    z1ViTEncoder.forward.<locals>.create_custom_forwardr   r   c                 s   s   | ]}|d k	r|V  qd S r}   r2   )r   vr2   r2   r3   	<genexpr>  s      z%ViTEncoder.forward.<locals>.<genexpr>)last_hidden_staterz   
attentions)		enumerater   r   Ztrainingr$   utils
checkpointtupler   )r/   rz   rv   rw   r   r   Zall_hidden_statesZall_self_attentionsiZlayer_moduleZlayer_head_maskr   Zlayer_outputsr2   r   r3   rQ   ~  s4    

zViTEncoder.forward)NFFT)rR   rS   rT   r   r"   r$   rW   r   rV   r   r   r   rQ   rY   r2   r2   r0   r3   r   w  s   	    


class ViTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ViTConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViTEmbeddings", "ViTLayer"]

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the input to `fp32` and cast it back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, ViTEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.cls_token.dtype)

    def _set_gradient_checkpointing(self, module: ViTEncoder, value: bool = False) -> None:
        if isinstance(module, ViTEncoder):
            module.gradient_checkpointing = value


VIT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VIT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare ViT Model transformer outputting raw hidden-states without any specific head on top.",
    VIT_START_DOCSTRING,
)
class ViTModel(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
        super().__init__(config)
        self.config = config

        self.embeddings = ViTEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = ViTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = ViTPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> ViTPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r/   r   r   r   r2   r2   r3   _prune_heads  s    zViTModel._prune_headsZvision)r   output_typer   Zmodalityexpected_outputrJ   rK   rv   rw   r   rI   r   r    c                 C   s
  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}|dkrLtd| || j j}| jjj	j
j}|j|kr~||}| j|||d}	| j|	||||d}
|
d }| |}| jdk	r| |nd}|s|dk	r||fn|f}||
dd  S t|||
j|
jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rK   rI   )rv   rw   r   r   r   r   )r   Zpooler_outputrz   r   )r   rw   r   use_return_dictrd   Zget_head_maskr   r4   r)   ra   r   r   r   r   r   r   r   rz   r   )r/   rJ   rK   rv   rw   r   rI   r   Zexpected_dtypeZembedding_outputZencoder_outputssequence_outputpooled_outputZhead_outputsr2   r2   r3   rQ     sD    

  
zViTModel.forward)TF)NNNNNNN)rR   rS   rT   r   rV   r"   r(   r   r   r@   r   r   r   VIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r$   rW   rX   r   r   rQ   rY   r2   r2   r0   r3   r     s8   	       


class ViTPooler(nn.Module):
    def __init__(self, config: ViTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first (CLS) token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@add_start_docstrings(
    """ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """,
    VIT_START_DOCSTRING,
)
class ViTForMaskedImageModeling(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.vit = ViTModel(config, add_pooling_layer=False, use_mask_token=True)

        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=config.hidden_size,
                out_channels=config.encoder_stride**2 * config.num_channels,
                kernel_size=1,
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride):
            raise ValueError(
                "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that "
                "the reconstructed image has the same dimensions as the input. "
                f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}."
            )

        outputs = self.vit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Reshape to (batch_size, num_channels, height, width), dropping the CLS token
        sequence_output = sequence_output[:, 1:]
        batch_size, sequence_length, num_channels = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            # Upsample the patch-level mask to pixel resolution so the L1 loss is averaged
            # over masked pixels only
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[1:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return MaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    """,
    VIT_START_DOCSTRING,
)
class ViTForImageClassification(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vit = ViTModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)rv   rw   r   rI   r   r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr7   )r   logitsrz   r   )r   r   r   r   r   ZdeviceZproblem_typer   r   r$   longr@   r   Zsqueezer
   rF   r	   r   rz   r   )r/   rJ   rv   r   rw   r   rI   r   r{   r   r   r   Zloss_fctr   r2   r2   r3   rQ     sP    	


"


z!ViTForImageClassification.forward)NNNNNNN)rR   rS   rT   r   r"   r   r   r   _IMAGE_CLASS_CHECKPOINTr   r   _IMAGE_CLASS_EXPECTED_OUTPUTr   r$   rW   rV   r   r   rQ   rY   r2   r2   r0   r3   r     s2          
r   )@rU   collections.abcr]   rA   typingr   r   r   r   r   r   r$   Ztorch.utils.checkpointr   Ztorch.nnr	   r