U
    9%eP                     @   s  d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZm Z  ddl!m"Z" e#e$Z%dZ&dZ'ddddgZ(dZ)dZ*ddddddgZ+dFe,e,ee, e,dddZ-G dd de
j.Z/G dd de
j.Z0G dd  d e
j.Z1G d!d" d"e
j.Z2G d#d$ d$e
j.Z3G d%d& d&e
j.Z4G d'd( d(e
j.Z5G d)d* d*e
j.Z6G d+d, d,e
j.Z7G d-d. d.e
j.Z8G d/d0 d0e
j.Z9G d1d2 d2e
j.Z:G d3d4 d4eZ;d5Z<d6Z=ed7e<G d8d9 d9e;Z>ed:e<G d;d< d<e;Z?G d=d> d>e
j.Z@G d?d@ d@e
j.ZAG dAdB dBe
j.ZBedCe<G dDdE dEe;ZCdS )Gz PyTorch MobileViT model.    N)DictOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )MobileViTConfigr   zapple/mobilevit-smalli     ztabby, tabby catzapple/mobilevit-x-smallzapple/mobilevit-xx-smallzapple/deeplabv3-mobilevit-smallz!apple/deeplabv3-mobilevit-x-smallz"apple/deeplabv3-mobilevit-xx-small)valuedivisor	min_valuereturnc                 C   sF   |dkr|}t |t| |d  | | }|d|  k r>||7 }t|S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r   	new_value r$   o/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibleI   s    r&   c                       sT   e Zd Zdeeeeeeeeeeeef dd fddZe	j
e	j
dd	d
Z  ZS )MobileViTConvLayerr   FTN)configin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   c                    s   t    t|d d | }|| dkr@td| d| d|| dkrbtd| d| dtj||||||||dd		| _|	rtj|d
dddd| _nd | _|
rt	|
t
rt|
 | _qt	|jt
rt|j | _q|j| _nd | _d S )Nr   r    r   zInput channels (z) are not divisible by z groups.zOutput channels (Zzeros)	r)   r*   r+   r,   paddingr/   r-   r.   Zpadding_modegh㈵>g?T)Znum_featuresepsZmomentumZaffineZtrack_running_stats)super__init__r"   
ValueErrorr   Conv2dconvolutionZBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   	__class__r$   r%   r5   Y   sB    



zMobileViTConvLayer.__init__featuresr   c                 C   s6   |  |}| jd k	r| |}| jd k	r2| |}|S N)r8   r9   r<   )r>   rB   r$   r$   r%   forward   s    




zMobileViTConvLayer.forward)r   r   Fr   TT)__name__
__module____qualname__r   r"   boolr   r;   r5   torchTensorrD   __classcell__r$   r$   r?   r%   r'   X   s(         
6r'   c                       sF   e Zd ZdZd
eeeeedd fddZejejddd	Z	  Z
S )MobileViTInvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r   N)r(   r)   r*   r,   r/   r   c              	      s   t    ttt||j d}|dkr:td| d|dkoH||k| _t|||dd| _	t|||d|||d| _
t|||dd	d
| _d S )Nr   )r   r    zInvalid stride .r   r)   r*   r+   r   )r)   r*   r+   r,   r-   r/   Fr)   r*   r+   r1   )r4   r5   r&   r"   roundZexpand_ratior6   use_residualr'   
expand_1x1conv_3x3
reduce_1x1)r>   r(   r)   r*   r,   r/   Zexpanded_channelsr?   r$   r%   r5      s6    
   
z"MobileViTInvertedResidual.__init__rA   c                 C   s4   |}|  |}| |}| |}| jr0|| S |S rC   )rR   rS   rT   rQ   )r>   rB   residualr$   r$   r%   rD      s
    


z!MobileViTInvertedResidual.forward)r   )rE   rF   rG   __doc__r   r"   r5   rI   rJ   rD   rK   r$   r$   r?   r%   rL      s        !rL   c                       sB   e Zd Zd	eeeeedd fddZejejdddZ  Z	S )
MobileViTMobileNetLayerr   N)r(   r)   r*   r,   
num_stagesr   c                    sR   t    t | _t|D ]0}t||||dkr4|ndd}| j| |}qd S )Nr   r   )r)   r*   r,   )r4   r5   r   
ModuleListlayerrangerL   append)r>   r(   r)   r*   r,   rX   irZ   r?   r$   r%   r5      s    

z MobileViTMobileNetLayer.__init__rA   c                 C   s   | j D ]}||}q|S rC   rZ   )r>   rB   layer_moduler$   r$   r%   rD      s    

zMobileViTMobileNetLayer.forward)r   r   
rE   rF   rG   r   r"   r5   rI   rJ   rD   rK   r$   r$   r?   r%   rW      s          rW   c                       sN   e Zd Zeedd fddZejejdddZejejdd	d
Z	  Z
S )MobileViTSelfAttentionNr(   hidden_sizer   c                    s   t    ||j dkr2td|f d|j d|j| _t||j | _| j| j | _tj|| j|j	d| _
tj|| j|j	d| _tj|| j|j	d| _t|j| _d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rM   )r.   )r4   r5   num_attention_headsr6   r"   attention_head_sizeall_head_sizer   LinearZqkv_biasquerykeyr   DropoutZattention_probs_dropout_probdropoutr>   r(   rc   r?   r$   r%   r5      s    
zMobileViTSelfAttention.__init__)xr   c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r    r   r   )sizerd   re   viewpermute)r>   rm   Znew_x_shaper$   r$   r%   transpose_for_scores   s    
z+MobileViTSelfAttention.transpose_for_scoreshidden_statesr   c           
      C   s   |  |}| | |}| | |}| |}t||dd}|t| j	 }t
jj|dd}| |}t||}|dddd }| d d | jf }	|j|	 }|S )Nrn   dimr   r    r   r   )rh   rr   ri   r   rI   matmul	transposemathsqrtre   r   
functionalZsoftmaxrk   rq   
contiguousro   rf   rp   )
r>   rt   Zmixed_query_layerZ	key_layerZvalue_layerZquery_layerZattention_scoresZattention_probsZcontext_layerZnew_context_layer_shaper$   r$   r%   rD      s    



zMobileViTSelfAttention.forward)rE   rF   rG   r   r"   r5   rI   rJ   rr   rD   rK   r$   r$   r?   r%   ra      s   ra   c                       s:   e Zd Zeedd fddZejejdddZ  Z	S )MobileViTSelfOutputNrb   c                    s*   t    t||| _t|j| _d S rC   r4   r5   r   rg   denserj   Zhidden_dropout_probrk   rl   r?   r$   r%   r5     s    
zMobileViTSelfOutput.__init__rs   c                 C   s   |  |}| |}|S rC   r   rk   r>   rt   r$   r$   r%   rD     s    

zMobileViTSelfOutput.forwardr`   r$   r$   r?   r%   r~     s   r~   c                       sN   e Zd Zeedd fddZee ddddZej	ej	dd	d
Z
  ZS )MobileViTAttentionNrb   c                    s.   t    t||| _t||| _t | _d S rC   )r4   r5   ra   	attentionr~   outputsetpruned_headsrl   r?   r$   r%   r5     s    
zMobileViTAttention.__init__)headsr   c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rv   )lenr   r   rd   re   r   r   rh   ri   r   r   r   rf   union)r>   r   indexr$   r$   r%   prune_heads%  s       zMobileViTAttention.prune_headsrs   c                 C   s   |  |}| |}|S rC   )r   r   )r>   rt   Zself_outputsattention_outputr$   r$   r%   rD   7  s    

zMobileViTAttention.forward)rE   rF   rG   r   r"   r5   r   r   rI   rJ   rD   rK   r$   r$   r?   r%   r     s   r   c                       s<   e Zd Zeeedd fddZejejdddZ  Z	S )MobileViTIntermediateNr(   rc   intermediate_sizer   c                    s>   t    t||| _t|jtr2t|j | _	n|j| _	d S rC   )
r4   r5   r   rg   r   r:   r=   r;   r   intermediate_act_fnr>   r(   rc   r   r?   r$   r%   r5   >  s
    
zMobileViTIntermediate.__init__rs   c                 C   s   |  |}| |}|S rC   )r   r   r   r$   r$   r%   rD   F  s    

zMobileViTIntermediate.forwardr`   r$   r$   r?   r%   r   =  s   r   c                       s@   e Zd Zeeedd fddZejejejdddZ  Z	S )MobileViTOutputNr   c                    s*   t    t||| _t|j| _d S rC   r   r   r?   r$   r%   r5   M  s    
zMobileViTOutput.__init__)rt   input_tensorr   c                 C   s    |  |}| |}|| }|S rC   r   )r>   rt   r   r$   r$   r%   rD   R  s    

zMobileViTOutput.forwardr`   r$   r$   r?   r%   r   L  s   r   c                       s<   e Zd Zeeedd fddZejejdddZ  Z	S )MobileViTTransformerLayerNr   c                    sZ   t    t||| _t|||| _t|||| _tj	||j
d| _tj	||j
d| _d S )Nr3   )r4   r5   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   r?   r$   r%   r5   Z  s    
z"MobileViTTransformerLayer.__init__rs   c                 C   s<   |  | |}|| }| |}| |}| ||}|S rC   )r   r   r   r   r   )r>   rt   r   Zlayer_outputr$   r$   r%   rD   b  s    

z!MobileViTTransformerLayer.forwardr`   r$   r$   r?   r%   r   Y  s   r   c                       s<   e Zd Zeeedd fddZejejdddZ  Z	S )MobileViTTransformerN)r(   rc   rX   r   c                    sJ   t    t | _t|D ](}t||t||j d}| j	| qd S )N)rc   r   )
r4   r5   r   rY   rZ   r[   r   r"   Z	mlp_ratior\   )r>   r(   rc   rX   _transformer_layerr?   r$   r%   r5   m  s    

zMobileViTTransformer.__init__rs   c                 C   s   | j D ]}||}q|S rC   r^   )r>   rt   r_   r$   r$   r%   rD   y  s    

zMobileViTTransformer.forwardr`   r$   r$   r?   r%   r   l  s   r   c                
       s|   e Zd ZdZdeeeeeeedd fddZeje	eje
f ddd	Zeje
ejd
ddZejejdddZ  ZS )MobileViTLayerz;
    MobileViT block: https://arxiv.org/abs/2110.02178
    r   N)r(   r)   r*   r,   rc   rX   r/   r   c                    s   t    |j| _|j| _|dkrXt||||dkr6|nd|dkrH|d ndd| _|}nd | _t||||jd| _	t|||dddd| _
t|||d| _tj||jd| _t|||dd| _t|d| ||jd| _d S )	Nr    r   )r)   r*   r,   r/   rN   F)r)   r*   r+   r0   r1   )rc   rX   r   )r4   r5   Z
patch_sizepatch_widthpatch_heightrL   downsampling_layerr'   Zconv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)r>   r(   r)   r*   r,   rc   rX   r/   r?   r$   r%   r5     sZ    

	      zMobileViTLayer.__init__rA   c                 C   s  | j | j }}t|| }|j\}}}}tt|| | }	tt|| | }
d}|
|ksh|	|krtjj||	|
fddd}d}|
| }|	| }|| }|	|| | |||}|
dd}|	||||}|
dd}|	|| |d}||f||||||d	}||fS )
NFbilinearro   modeZalign_cornersTr   r    r   rn   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r"   shaperz   ceilr   r|   r   reshapery   )r>   rB   r   r   
patch_arear   r   Zorig_heightZ
orig_widthZ
new_heightZ	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dictr$   r$   r%   	unfolding  sH       
   	zMobileViTLayer.unfolding)r   r   r   c                 C   s   | j | j }}t|| }|d }|d }|d }|d }	|d }
| |||d}|dd}||| |	 |
||}|dd	}||||	| |
| }|d
 rtjj	||d ddd}|S )Nr   r   r   r   r   rn   r   r   r    r   r   r   Fr   )
r   r   r"   r}   rp   ry   r   r   r|   r   )r>   r   r   r   r   r   r   r   r   r   r   rB   r$   r$   r%   folding  s<    
         zMobileViTLayer.foldingc                 C   s|   | j r|  |}|}| |}| |}| |\}}| |}| |}| ||}| |}| t	j
||fdd}|S Nr   rv   )r   r   r   r   r   r   r   r   r   rI   cat)r>   rB   rU   r   r   r$   r$   r%   rD     s    





zMobileViTLayer.forward)r   )rE   rF   rG   rV   r   r"   r5   rI   rJ   r   r   r   r   rD   rK   r$   r$   r?   r%   r     s    :+r   c                       sD   e Zd Zedd fddZd
ejeeee	e
f ddd	Z  ZS )MobileViTEncoderNr(   r   c           
   	      sZ  t    || _t | _d| _d }}|jdkr<d}d}n|jdkrJd}d}t||j	d |j	d ddd}| j
| t||j	d |j	d dd	d}| j
| t||j	d |j	d	 d|jd dd
}| j
| |r|d9 }t||j	d	 |j	d d|jd d|d}| j
| |r"|d9 }t||j	d |j	d d|jd d	|d}	| j
|	 d S )NFr   T   r   r   )r)   r*   r,   rX   r    r   )r)   r*   r,   rc   rX      )r)   r*   r,   rc   rX   r/      )r4   r5   r(   r   rY   rZ   gradient_checkpointingZoutput_striderW   neck_hidden_sizesr\   r   Zhidden_sizes)
r>   r(   Zdilate_layer_4Zdilate_layer_5r/   Zlayer_1Zlayer_2Zlayer_3Zlayer_4Zlayer_5r?   r$   r%   r5   !  sx    



		zMobileViTEncoder.__init__FT)rt   output_hidden_statesreturn_dictr   c                 C   s   |rdnd }t | jD ]H\}}| jrH| jrHdd }tjj|||}n||}|r||f }q|sztdd ||fD S t||dS )Nr$   c                    s    fdd}|S )Nc                     s    |  S rC   r$   )inputsmoduler$   r%   custom_forwardw  s    zOMobileViTEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr$   )r   r   r$   r   r%   create_custom_forwardv  s    z7MobileViTEncoder.forward.<locals>.create_custom_forwardc                 s   s   | ]}|d k	r|V  qd S rC   r$   ).0vr$   r$   r%   	<genexpr>  s      z+MobileViTEncoder.forward.<locals>.<genexpr>)last_hidden_statert   )		enumeraterZ   r   ZtrainingrI   utils
checkpointtupler   )r>   rt   r   r   Zall_hidden_statesr]   r_   r   r$   r$   r%   rD   k  s    zMobileViTEncoder.forward)FT)rE   rF   rG   r   r5   rI   rJ   rH   r   r   r   rD   rK   r$   r$   r?   r%   r      s   M  
r   c                   @   sJ   e Zd ZdZeZdZdZdZe	e
je
je
jf ddddZdd
dZdS )MobileViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    	mobilevitpixel_valuesTN)r   r   c                 C   sj   t |tjtjfr@|jjjd| jjd |j	dk	rf|j	j
  n&t |tjrf|j	j
  |jjd dS )zInitialize the weightsg        )meanZstdNg      ?)r:   r   rg   r7   weightdataZnormal_r(   Zinitializer_ranger.   Zzero_r   Zfill_)r>   r   r$   r$   r%   _init_weights  s    
z&MobileViTPreTrainedModel._init_weightsFc                 C   s   t |tr||_d S rC   )r:   r   r   )r>   r   r   r$   r$   r%   _set_gradient_checkpointing  s    
z4MobileViTPreTrainedModel._set_gradient_checkpointing)F)rE   rF   rG   rV   r   config_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingr   r   rg   r7   r   r   r   r$   r$   r$   r%   r     s    r   aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zWThe bare MobileViT model outputting raw hidden-states without any specific head on top.c                	       st   e Zd Zdeed fddZdd Zeee	e
eededdeej ee ee eeef d
ddZ  ZS )MobileViTModelT)r(   expand_outputc                    sn   t  | || _|| _t||j|jd ddd| _t|| _	| jrbt||jd |jd dd| _
|   d S )	Nr   r   r    )r)   r*   r+   r,   r      r   rN   )r4   r5   r(   r   r'   Znum_channelsr   	conv_stemr   encoderconv_1x1_exp	post_init)r>   r(   r   r?   r$   r%   r5     s&    
zMobileViTModel.__init__c                 C   sF   |  D ]8\}}| jj| }t|tr|jjD ]}|j| q.qdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr   rZ   r:   r   r   r   r   )r>   Zheads_to_pruneZlayer_indexr   Zmobilevit_layerr   r$   r$   r%   _prune_heads  s
    
zMobileViTModel._prune_headsZvision)r   output_typer   Zmodalityexpected_outputN)r   r   r   r   c           	      C   s   |d k	r|n| j j}|d k	r |n| j j}|d kr8td| |}| j|||d}| jr|| |d }tj	|ddgdd}n|d }d }|s|d k	r||fn|f}||dd   S t
|||jd	S )
Nz You have to specify pixel_valuesr   r   r   ru   rn   F)rw   Zkeepdimr   )r   pooler_outputrt   )r(   r   use_return_dictr6   r   r   r   r   rI   r   r   rt   )	r>   r   r   r   Zembedding_outputZencoder_outputsr   pooled_outputr   r$   r$   r%   rD     s0    
zMobileViTModel.forward)T)NNN)rE   rF   rG   r   rH   r5   r   r   MOBILEVIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   rI   rJ   r   r   rD   rK   r$   r$   r?   r%   r     s&   
	   
r   z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                
       sp   e Zd Zedd fddZeeeee	e
edd	eej ee eej ee eee	f dddZ  ZS )
MobileViTForImageClassificationNr   c                    sd   t  | |j| _t|| _tj|jdd| _|jdkrNt	|j
d |jnt | _|   d S )NT)Zinplacer   rn   )r4   r5   
num_labelsr   r   r   rj   classifier_dropout_probrk   rg   r   ZIdentity
classifierr   r>   r(   r?   r$   r%   r5   #  s    
$z(MobileViTForImageClassification.__init__)r   r   r   r   )r   r   labelsr   r   c                 C   sr  |dk	r|n| j j}| j|||d}|r.|jn|d }| | |}d}|dk	r2| j jdkr| jdkrtd| j _n4| jdkr|jt	j
ks|jt	jkrd| j _nd| j _| j jdkrt }	| jdkr|	| | }n
|	||}nN| j jdkrt }	|	|d| j|d}n| j jdkr2t }	|	||}|sb|f|dd  }
|dk	r^|f|
 S |
S t|||jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   Z
regressionZsingle_label_classificationZmulti_label_classificationrn   r    )losslogitsrt   )r(   r   r   r   r   rk   Zproblem_typer   ZdtyperI   longr"   r
   Zsqueezer	   rp   r   r   rt   )r>   r   r   r   r   outputsr   r   r   loss_fctr   r$   r$   r%   rD   2  s>    


"


z'MobileViTForImageClassification.forward)NNNN)rE   rF   rG   r   r5   r   r   r   _IMAGE_CLASS_CHECKPOINTr   r   _IMAGE_CLASS_EXPECTED_OUTPUTr   rI   rJ   rH   r   r   rD   rK   r$   r$   r?   r%   r     s&       
r   c                       s<   e Zd Zeeedd fddZejejdddZ  Z	S )MobileViTASPPPoolingN)r(   r)   r*   r   c              	      s4   t    tjdd| _t|||ddddd| _d S )Nr   )Zoutput_sizeTrelu)r)   r*   r+   r,   r0   r1   )r4   r5   r   ZAdaptiveAvgPool2dglobal_poolr'   r   )r>   r(   r)   r*   r?   r$   r%   r5   q  s    
zMobileViTASPPPooling.__init__rA   c                 C   s:   |j dd  }| |}| |}tjj||ddd}|S )Nru   r   Fr   )r   r   r   r   r|   r   )r>   rB   Zspatial_sizer$   r$   r%   rD     s
    

zMobileViTASPPPooling.forwardr`   r$   r$   r?   r%   r   p  s   r   c                       s<   e Zd ZdZedd fddZejejdddZ  Z	S )	MobileViTASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    Nr   c                    s   t     jd  jt jdkr0tdt | _	t
 ddd}| j	| | j	 fdd jD  t }| j	| t
 d	 ddd| _tj jd
| _d S )Nru   r   z"Expected 3 values for atrous_ratesr   r   rO   c              
      s    g | ]}t  d |ddqS )r   r   )r)   r*   r+   r/   r1   )r'   )r   Zrater(   r)   r*   r$   r%   
<listcomp>  s   	z*MobileViTASPP.__init__.<locals>.<listcomp>r   )p)r4   r5   r   aspp_out_channelsr   Zatrous_ratesr6   r   rY   convsr'   r\   extendr   projectrj   Zaspp_dropout_probrk   )r>   r(   Zin_projectionZ
pool_layerr?   r  r%   r5     s:    


	    zMobileViTASPP.__init__rA   c                 C   sD   g }| j D ]}||| q
tj|dd}| |}| |}|S r   )r  r\   rI   r   r  rk   )r>   rB   ZpyramidconvZpooled_featuresr$   r$   r%   rD     s    


zMobileViTASPP.forward
rE   rF   rG   rV   r   r5   rI   rJ   rD   rK   r$   r$   r?   r%   r    s   +r  c                       s<   e Zd ZdZedd fddZejejdddZ  Z	S )	MobileViTDeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    Nr   c              	      sB   t    t|| _t|j| _t||j	|j
ddddd| _d S )Nr   FT)r)   r*   r+   r0   r1   r.   )r4   r5   r  asppr   Z	Dropout2dr   rk   r'   r  r   r   r   r?   r$   r%   r5     s    

zMobileViTDeepLabV3.__init__rs   c                 C   s&   |  |d }| |}| |}|S )Nrn   )r  rk   r   )r>   rt   rB   r$   r$   r%   rD     s    

zMobileViTDeepLabV3.forwardr
  r$   r$   r?   r%   r    s   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                
       sl   e Zd Zedd fddZeeeee	dd	e
ej e
ej e
e e
e eeef dddZ  ZS )
 MobileViTForSemanticSegmentationNr   c                    s8   t  | |j| _t|dd| _t|| _|   d S )NF)r   )r4   r5   r   r   r   r  segmentation_headr   r   r?   r$   r%   r5     s
    
z)MobileViTForSemanticSegmentation.__init__)r   r   )r   r   r   r   r   c                 C   s  |dk	r|n| j j}|dk	r |n| j j}| j|d|d}|rB|jn|d }| |}d}|dk	r| j jdkrvtdn6tj	j
||jdd ddd	}	t| j jd
}
|
|	|}|s|r|f|dd  }n|f|dd  }|dk	r|f| S |S t|||r|jndddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```NTr   r   z/The number of labels should be greater than oneru   r   Fr   )Zignore_indexr    )r   r   rt   Z
attentions)r(   r   r   r   rt   r  r   r6   r   r|   r   r   r	   Zsemantic_loss_ignore_indexr   )r>   r   r   r   r   r   Zencoder_hidden_statesr   r   Zupsampled_logitsr   r   r$   r$   r%   rD     sB    '

   
z(MobileViTForSemanticSegmentation.forward)NNNN)rE   rF   rG   r   r5   r   r   r   r   r   r   rI   rJ   rH   r   r   rD   rK   r$   r$   r?   r%   r    s   

    
r  )r   N)DrV   rz   typingr   r   r   r   r   rI   Ztorch.utils.checkpointr   Ztorch.nnr   r	   r
   Zactivationsr   Zmodeling_outputsr   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   r   r   r   r   r   Zconfiguration_mobilevitr   Z
get_loggerrE   loggerr   r   r   r   r   Z'MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LISTr"   r&   Moduler'   rL   rW   ra   r~   r   r   r   r   r   r   r   r   ZMOBILEVIT_START_DOCSTRINGr   r   r   r   r  r  r  r$   r$   r$   r%   <module>   sx   
@03 "lWN;