U
    ,-e93                     @   s*  d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZmZ dd
lmZ ddlmZ dgZdZeG dd deZG dd deZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdZdZ edeG d d! d!eZ!dS )"z PyTorch ViTMatte model.    )	dataclass)OptionalTupleN)nn   )AutoBackbone)PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings)BackboneMixin   )VitMatteConfigz$hustvl/vitmatte-small-composition-1kr   c                   @   s^   e Zd ZU dZdZeej ed< dZ	ejed< dZ
eeej  ed< dZeeej  ed< dS )ImageMattingOutputa  
    Class for outputs of image matting models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Loss.
        alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
           Estimated alpha values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossalphashidden_states
attentions)__name__
__module____qualname____doc__r   r   torchZFloatTensor__annotations__r   r   r   r    r   r   o/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr   -   s
   
r   c                   @   s.   e Zd ZdZeZdZdZdd Zd
ddZ	d	S )VitMattePreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    pixel_valuesTc                 C   s<   t |tjr8|jjjd| jjd |jd k	r8|jj	  d S )Ng        )ZmeanZstd)

isinstancer   Conv2dweightdataZnormal_configZinitializer_rangebiasZzero_)selfmoduler   r   r   _init_weightsS   s    
z%VitMattePreTrainedModel._init_weightsFc                 C   s   t |tr||_d S N)r   r   Zgradient_checkpointing)r%   r&   valuer   r   r   _set_gradient_checkpointingY   s    
z3VitMattePreTrainedModel._set_gradient_checkpointingN)F)
r   r   r   r   r   config_classZmain_input_nameZsupports_gradient_checkpointingr'   r*   r   r   r   r   r   I   s   r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	VitMatteBasicConv3x3zP
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
       r   c                    sB   t    tj||d||dd| _tj||jd| _t | _	d S )Nr   F)in_channelsout_channelskernel_sizestridepaddingr$   )Zeps)
super__init__r   r    convBatchNorm2dZbatch_norm_eps
batch_normReLUrelu)r%   r#   r.   r/   r1   r2   	__class__r   r   r4   c   s    
zVitMatteBasicConv3x3.__init__c                 C   s"   |  |}| |}| |}|S r(   )r5   r7   r9   r%   Zhidden_stater   r   r   forwardp   s    


zVitMatteBasicConv3x3.forward)r-   r   r   r   r   r   r4   r=   __classcell__r   r   r:   r   r,   ^   s   r,   c                       s(   e Zd ZdZ fddZdd Z  ZS )VitMatteConvStreamzc
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    c                    sv   t    |jj}|j}t | _|g| | _t	t
| jd D ]0}| j| }| j|d  }| jt||| q@d S )Nr   )r3   r4   backbone_configZnum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr,   )r%   r#   r.   r/   iZin_chan_Z	out_chan_r:   r   r   r4   }   s    


zVitMatteConvStream.__init__c                 C   sJ   d|i}|}t t| jD ]*}| j| |}dt|d  }|||< q|S )NZdetailed_feature_map_0detailed_feature_map_r   )rF   rG   rD   str)r%   r   Zout_dictZ
embeddingsrI   Zname_r   r   r   r=      s    
zVitMatteConvStream.forwardr>   r   r   r:   r   r@   x   s   r@   c                       s(   e Zd ZdZ fddZdd Z  ZS )VitMatteFusionBlockz\
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    c                    s"   t    t|||ddd| _d S )Nr   )r1   r2   )r3   r4   r,   r5   )r%   r#   r.   r/   r:   r   r   r4      s    
zVitMatteFusionBlock.__init__c                 C   s4   t jj|dddd}tj||gdd}| |}|S )Nr-   ZbilinearF)Zscale_factormodeZalign_cornersr   )dim)r   Z
functionalZinterpolater   catr5   )r%   featuresZdetailed_feature_mapZupscaled_featuresoutr   r   r   r=      s    
zVitMatteFusionBlock.forwardr>   r   r   r:   r   rL      s   rL   c                       s(   e Zd ZdZ fddZdd Z  ZS )VitMatteHeadzJ
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    c                    sZ   t    |jd }d}ttj||ddddt|tdtj|ddddd| _d S )N   r   r   )r0   r1   r2   Tr   )	r3   r4   fusion_hidden_sizesr   Z
Sequentialr    r6   r8   matting_convs)r%   r#   r.   Zmid_channelsr:   r   r   r4      s    

zVitMatteHead.__init__c                 C   s   |  |}|S r(   )rV   r<   r   r   r   r=      s    
zVitMatteHead.forwardr>   r   r   r:   r   rR      s   rR   c                       s(   e Zd ZdZ fddZdd Z  ZS )VitMatteDetailCaptureModulezG
    Simple and lightweight Detail Capture Module for ViT Matting.
    c              	      s   t    t|jt|jd kr*td|| _t|| _| jj	| _	t
 | _|jg|j | _tt| jd D ]:}| jt|| j| | j	|d    | j|d  d qpt|| _d S )Nr   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r#   r.   r/   )r3   r4   rG   rU   rB   
ValueErrorr#   r@   
convstreamrE   r   rC   fusion_blocksZhidden_sizeZfusion_channelsrF   rH   rL   rR   matting_head)r%   r#   rI   r:   r   r   r4      s&    



z$VitMatteDetailCaptureModule.__init__c                 C   s`   |  |}tt| jD ]2}dtt| j| d  }| j| ||| }qt| |}|S )NrJ   r   )rY   rF   rG   rZ   rK   r   Zsigmoidr[   )r%   rP   r   Zdetail_featuresrI   Zdetailed_feature_map_namer   r   r   r   r=      s    
z#VitMatteDetailCaptureModule.forwardr>   r   r   r:   r   rW      s   rW   aI  
    Parameters:
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.
        config ([`UperNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aw  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`VitMatteImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
            `attentions` under returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
            returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zNViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.c                	       sf   e Zd Z fddZeedeee	dd	e
ej e
e e
e e
ej e
e dddZ  ZS )
VitMatteForImageMattingc                    s6   t  | || _t|j| _t|| _| 	  d S r(   )
r3   r4   r#   r   from_configrA   backbonerW   decoder	post_init)r%   r#   r:   r   r   r4     s
    
z VitMatteForImageMatting.__init__zbatch_size, sequence_length)output_typer+   N)r   output_attentionsoutput_hidden_stateslabelsreturn_dictc                 C   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| jj|||d}|jd }| ||}d}	|dk	rxtd|s|f|dd  }
|	dk	r|	f|
 S |
S t	|	||j
|jdS )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Returns:

        Examples:

        ```python
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```N)rc   rb   rS   zTraining is not yet supportedr   )r   r   r   r   )r#   Zuse_return_dictrc   rb   r^   Zforward_with_filtered_kwargsZfeature_mapsr_   NotImplementedErrorr   r   r   )r%   r   rb   rc   rd   re   outputsrP   r   r   outputr   r   r   r=     s.    ,  
zVitMatteForImageMatting.forward)NNNNN)r   r   r   r4   r   VITMATTE_INPUTS_DOCSTRINGformatr   r   _CONFIG_FOR_DOCr   r   ZTensorboolr=   r?   r   r   r:   r   r\     s   

     r\   )"r   dataclassesr   typingr   r   r   r    r   Zmodeling_utilsr   utilsr	   r
   r   r   Zutils.backbone_utilsr   Zconfiguration_vitmatter   Z&VITMATTE_PRETRAINED_MODEL_ARCHIVE_LISTrk   r   r   Moduler,   r@   rL   rR   rW   ZVITMATTE_START_DOCSTRINGri   r\   r   r   r   r   <module>   s6   )
