""" PyTorch ConvNextV2 model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_convnextv2 import ConvNextV2Config


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ConvNextV2Config"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/convnextv2-tiny-1k-224",
]


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # work with tensors of any dimensionality, not just 4D conv feature maps
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize: each sample either survives whole or is dropped whole
    output = input.div(keep_prob) * random_tensor
    return output
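
# Illustrative usage sketch (added commentary, not part of the upstream module). Because
# surviving samples are rescaled by 1 / keep_prob, the output matches the input in
# expectation, and the function is the identity outside of training:
#
#     >>> x = torch.ones(4, 3, 8, 8)
#     >>> y = drop_path(x, drop_prob=0.5, training=True)
#     >>> # each of the 4 samples in y is either all zeros or all 2.0
#     >>> torch.equal(drop_path(x, drop_prob=0.5, training=False), x)
#     True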


class ConvNextV2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class ConvNextV2GRN(nn.Module):
    """GRN (Global Response Normalization) layer"""

    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        # Compute and normalize global spatial feature maps
        global_features = torch.norm(hidden_states, p=2, dim=(1, 2), keepdim=True)
        norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6)
        hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states

        return hidden_states
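
# What GRN computes, as a sketch (added commentary; names follow the forward pass above).
# Given channels-last activations x of shape (N, H, W, C):
#
#     G(x)_c = ||x[:, :, :, c]||_2                 # L2 norm over the spatial dims (H, W)
#     N(x)_c = G(x)_c / (mean_c G(x) + 1e-6)       # divisive normalization across channels
#     out    = weight * (x * N(x)) + bias + x      # learnable affine plus residual
#
# weight and bias are zero-initialized, so a freshly constructed GRN is the identity map.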


class ConvNextV2LayerNorm(nn.Module):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            # normalize over the channel dim (dim 1), preserving the input dtype
            input_dtype = x.dtype
            x = x.float()
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = x.to(dtype=input_dtype)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class ConvNextV2Embeddings(nn.Module):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config):
        super().__init__()
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, config.hidden_sizes[0], kernel_size=config.patch_size, stride=config.patch_size
        )
        self.layernorm = ConvNextV2LayerNorm(config.hidden_sizes[0], eps=1e-6, data_format="channels_first")
        self.num_channels = config.num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.patch_embeddings(pixel_values)
        embeddings = self.layernorm(embeddings)
        return embeddings


class ConvNextV2Layer(nn.Module):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv], all in
    (N, C, H, W); (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear], then Permute
    back.

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    """

    def __init__(self, config, dim, drop_path=0.0):
        super().__init__()
        # depthwise convolution: 7x7, one filter per channel
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.layernorm = ConvNextV2LayerNorm(dim, eps=1e-6)
        # pointwise (1x1) convolutions, implemented as linear layers on channels-last tensors
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = ACT2FN[config.hidden_act]
        self.grn = ConvNextV2GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = ConvNextV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        input = hidden_states
        x = self.dwconv(hidden_states)
        # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
        x = x.permute(0, 2, 3, 1)
        x = self.layernorm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
        x = x.permute(0, 3, 1, 2)

        x = input + self.drop_path(x)
        return x
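
# Shape flow through one block, as a sketch (added commentary; dim == C):
#
#     (N, C, H, W) --dwconv 7x7--> (N, C, H, W) --permute--> (N, H, W, C)
#     --layernorm--> (N, H, W, C) --pwconv1--> (N, H, W, 4C) --act, grn--> (N, H, W, 4C)
#     --pwconv2--> (N, H, W, C) --permute--> (N, C, H, W) --+ residual--> (N, C, H, W)
#
# Spatial resolution never changes inside a block; only stage transitions downsample.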


class ConvNextV2Stage(nn.Module):
    """ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
    """

    def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
        super().__init__()

        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = nn.Sequential(
                ConvNextV2LayerNorm(in_channels, eps=1e-6, data_format="channels_first"),
                nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride),
            )
        else:
            self.downsampling_layer = nn.Identity()
        drop_path_rates = drop_path_rates or [0.0] * depth
        self.layers = nn.Sequential(
            *[ConvNextV2Layer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        hidden_states = self.downsampling_layer(hidden_states)
        hidden_states = self.layers(hidden_states)
        return hidden_states


class ConvNextV2Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.stages = nn.ModuleList()
        # one stochastic depth rate per layer, increasing linearly up to config.drop_path_rate
        drop_path_rates = [
            x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths)
        ]
        prev_chs = config.hidden_sizes[0]
        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            stage = ConvNextV2Stage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                stride=2 if i > 0 else 1,
                depth=config.depths[i],
                drop_path_rates=drop_path_rates[i],
            )
            self.stages.append(stage)
            prev_chs = out_chs

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        all_hidden_states = () if output_hidden_states else None

        for i, layer_module in enumerate(self.stages):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            hidden_states = layer_module(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )


class ConvNextV2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ConvNextV2Config
    base_model_prefix = "convnextv2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, ConvNextV2Encoder):
            module.gradient_checkpointing = value


CONVNEXTV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvNextV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CONVNEXTV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`ConvNextImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare ConvNextV2 model outputting raw features without any specific head on top.",
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Model(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)

        # final layernorm layer
        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        # global average pooling, (N, C, H, W) -> (N, C)
        pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@add_start_docstrings(
    """
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2ForImageClassification(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.convnextv2 = ConvNextV2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convnextv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # infer the problem type from num_labels and the label dtype when it is not set explicitly
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )


@add_start_docstrings(
    """
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Backbone(ConvNextV2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)
        self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, data_format="channels_first")
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            return_dict=True,
        )

        hidden_states = outputs.hidden_states

        feature_maps = ()
        # the first hidden state is the stem output, so the "stem" stage name is skipped here
        for idx, (stage, hidden_state) in enumerate(zip(self.stage_names[1:], hidden_states[1:])):
            if stage in self.out_features:
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )