U
    ,-e                    @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZmZ eeZ dZ!dgZ"dCe
j#e
j$ee% dddZ&e
j#e
j#dddZ'e
j#e
j#dddZ(eG dd deZ)eG dd deZ*eG dd deZ+G dd dej,Z-G dd  d ej,Z.G d!d" d"ej,Z/G d#d$ d$ej,Z0G d%d& d&ej,Z1G d'd( d(eZ2d)Z3d*Z4d+Z5d,Z6G d-d. d.ej,Z7dDe
j8e
j$e
j9e%d/d0d1Z:G d2d3 d3ej,Z;G d4d5 d5e2Z<G d6d7 d7ej,Z=G d8d9 d9e2Z>ee3G d:d; d;e2Z?G d<d= d=ej,Z@G d>d? d?e2ZAed@e3G dAdB dBe2ZBdS )Ez PyTorch CLIPSeg model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfigzCIDAS/clipseg-rd64-refined)maskdtypetgt_lenc                 C   sj   |   \}}|dk	r|n|}| ddddddf |d|||}d| }||tjt|jS )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtoZmasked_filltorchboolfinfomin)r   r   r   bszsrc_lenZexpanded_maskZinverted_mask r#   m/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/clipseg/modeling_clipseg.py_expand_mask3   s
    *r%   )logitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalZcross_entropyr   arangelenr)   )r&   r#   r#   r$   contrastive_lossC   s    r-   )
similarityr'   c                 C   s    t | }t |  }|| d S )Ng       @)r-   t)r.   Zcaption_lossZ
image_lossr#   r#   r$   clipseg_lossH   s    r0   c                   @   s   e Zd ZU dZdZeej ed< dZ	ejed< dZ
ejed< dZejed< dZejed< dZeed< dZeed	< ee d
ddZdS )CLIPSegOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`CLIPSegVisionModel`].
        text_model_output(`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output(`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr'   c                    s   t  fdd  D S )Nc                 3   s,   | ]$}|d kr | nt  | V  qdS ))r7   r8   Ngetattrto_tuple.0kselfr#   r$   	<genexpr>o   s   z)CLIPSegOutput.to_tuple.<locals>.<genexpr>tuplekeysr@   r#   r@   r$   r<   n   s    zCLIPSegOutput.to_tuple)__name__
__module____qualname____doc__r2   r   r   FloatTensor__annotations__r3   r4   r5   r6   r7   r   r8   r   r   r<   r#   r#   r#   r$   r1   N   s   
r1   c                   @   sL   e Zd ZU dZdZejed< dZe	e
ej  ed< dZe	e
ej  ed< dS )CLIPSegDecoderOutputa  
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nr&   hidden_states
attentions)rF   rG   rH   rI   r&   r   rJ   rK   rM   r   r   rN   r#   r#   r#   r$   rL   u   s   
rL   c                   @   sx   e Zd ZU dZdZeej ed< dZ	ejed< dZ
ejed< dZejed< dZeed< dZeed< ee d	d
dZdS )CLIPSegImageSegmentationOutputa,  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nr2   r&   conditional_embeddingspooled_outputr8   decoder_outputr9   c                    s   t  fdd  D S )Nc                 3   s,   | ]$}|d kr | nt  | V  qdS ))r8   rR   Nr:   r=   r@   r#   r$   rB      s   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>rC   r@   r#   r@   r$   r<      s    z'CLIPSegImageSegmentationOutput.to_tuple)rF   rG   rH   rI   r2   r   r   rJ   rK   r&   rP   rQ   r8   r   rR   rL   r   r   r<   r#   r#   r#   r$   rO      s   
	rO   c                       s>   e Zd Zed fddZdd ZejejdddZ	  Z
S )	CLIPSegVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)Zin_channelsZout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__rU   hidden_size	embed_dimZ
image_size
patch_sizer   	Parameterr   Zrandnclass_embeddingConv2dZnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr+   r   rA   rU   	__class__r#   r$   r`      s"    
z CLIPSegVisionEmbeddings.__init__c                 C   s   t |dkrtdt| jd }| jjdd  jd| jj	||}t
jj||dddd| jj	|d |d  j}t| jjd d |g}|S )	NrY   z#new_size should consist of 2 valuesg      ?r   ZbicubicF)modeZalign_cornersr   )r,   
ValueErrorintrh   rk   weightTviewrU   ra   r   r*   Zinterpolatesqueezer   cat)rA   new_sizeZnum_patches_one_directionabresultr#   r#   r$   interpolate_position_embeddings   s$        z7CLIPSegVisionEmbeddings.interpolate_position_embeddings)pixel_valuesr'   c                 C   s   |j d }| |}|ddd}| j|dd}tj||gdd}|j d | jkrt	t
|j d d }|| ||f }||j}n|| | j }|S )Nr   rY   r   r\   dim)shaperg   flatten	transposere   r   r   rw   ri   rr   mathsqrtr|   r   r   rk   rZ   )rA   r}   
batch_sizeZpatch_embedsZclass_embeds
embeddingsZ	new_shaper#   r#   r$   forward   s    

zCLIPSegVisionEmbeddings.forward)rF   rG   rH   r   r`   r|   r   rJ   Tensorr   __classcell__r#   r#   rn   r$   rS      s   rS   c                       sL   e Zd Zed fddZdeej eej eej ej	dddZ
  ZS )	CLIPSegTextEmbeddingsrT   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )NrZ   r[   Fr]   )r_   r`   ra   r   rj   Z
vocab_sizetoken_embeddingZmax_position_embeddingsrk   rl   r   r+   r   rA   rU   rb   rn   r#   r$   r`      s    
  zCLIPSegTextEmbeddings.__init__N)	input_idsrZ   inputs_embedsr'   c                 C   sb   |d k	r|j d n|j d }|d kr:| jd d d |f }|d krL| |}| |}|| }|S )Nr\   )r   rZ   r   rk   )rA   r   rZ   r   Z
seq_lengthZposition_embeddingsr   r#   r#   r$   r      s    

zCLIPSegTextEmbeddings.forward)NNN)rF   rG   rH   r   r`   r   r   
LongTensorrJ   r   r   r   r#   r#   rn   r$   r      s      r   c                       sz   e Zd ZdZ fddZejeedddZdeje	ej e	ej e	e
 eeje	ej e	eej  f d	d
dZ  ZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkrZtd| j d| j d| jd | _	|j
| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      )r_   r`   rU   ra   rb   num_attention_heads	num_headshead_dimrq   scaleZattention_dropoutdropoutr   Lineark_projv_projq_projout_projrm   rn   r#   r$   r`     s    
zCLIPSegAttention.__init__)tensorseq_lenr!   c                 C   s    | ||| j| jdd S )Nr   rY   )ru   r   r   r   
contiguous)rA   r   r   r!   r#   r#   r$   _shape  s    zCLIPSegAttention._shapeNFrM   attention_maskcausal_attention_maskoutput_attentionsr'   c                 C   s  |  \}}}| || j }| | |d|}	| | |d|}
|| j d| jf}| |||j| }|	j| }	|
j| }
|	 d}t	
||	dd}|  || j ||fkrtd|| j ||f d|   |dk	rD|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}|dk	r|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd}|r||| j||}||| j ||}nd}tjj|| j| jd	}t	
||
}|  || j || jfkrRtd
|| j|| jf d|   ||| j|| j}|dd}||||}| |}||fS )z#Input shape: Batch x Time x Channelr\   r   rY   z$Attention weights should be of size z	, but is Nz!Attention mask should be of size r~   )ptrainingz `attn_output` should be of size )r   r   r   r   r   r   r   r   ru   r   Zbmmr   rq   r   r*   Zsoftmaxr   r   Zreshaper   )rA   rM   r   r   r   r!   r   rb   Zquery_statesZ
key_statesZvalue_statesZ
proj_shaper"   attn_weightsZattn_weights_reshapedZ
attn_probsZattn_outputr#   r#   r$   r     sX    	





zCLIPSegAttention.forward)NNF)rF   rG   rH   rI   r`   r   r   rr   r   r   r   r   r   r   r#   r#   rn   r$   r     s      r   c                       s0   e Zd Z fddZejejdddZ  ZS )
CLIPSegMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)r_   r`   rU   r	   
hidden_actactivation_fnr   r   ra   intermediate_sizefc1fc2rm   rn   r#   r$   r`   l  s
    
zCLIPSegMLP.__init__)rM   r'   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rA   rM   r#   r#   r$   r   s  s    


zCLIPSegMLP.forward)rF   rG   rH   r`   r   r   r   r   r#   r#   rn   r$   r   k  s   r   c                       sJ   e Zd Zed fddZdejejejee e	ej
 dddZ  ZS )	CLIPSegEncoderLayerrT   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)Zepsr_   r`   ra   rb   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rm   rn   r#   r$   r`   |  s    


zCLIPSegEncoderLayer.__init__Fr   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r`||f7 }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rM   r   r   r   )r   r   r   r   rA   rM   r   r   r   Zresidualr   outputsr#   r#   r$   r     s"    




zCLIPSegEncoderLayer.forward)F)rF   rG   rH   r   r`   r   r   r   r   r   rJ   r   r   r#   r#   rn   r$   r   {  s    r   c                   @   s.   e Zd ZdZeZdZdZdd Zd
ddZ	d	S )CLIPSegPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    clipTc                 C   sX  | j j}t|trF|jjjjd|d d |jjjjd|d d nt|t	r| j j}t
jj|jd|jd | d t
jj|jj|j j| d t
jj|jj|j j| d nTt|trD| j j}|jd d|j j d  | }|jd | }t
jj|jj|d t
jj|jj|d t
jj|jj|d t
jj|jj|d nt|tr| j j}|j jd d|j j d  | }d|j j d | }t
jj|jj|d t
jj|jj|d nPt|trt
jj|jj|jd | j j d t
jj|jj|jd | j j d t|t
jr.|j j!  |jj"d t|t
j#rT|j dk	rT|j j!  dS )	zInitialize the weightsg        g{Gz?)Zmeanstdr   )r   rY   r   N)$rU   Zinitializer_factor
isinstancer   r   rs   dataZnormal_rk   rS   r   initre   rb   rg   Zinitializer_ranger   num_hidden_layersr   r   r   r   r   ra   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   rX   Zzero_Zfill_r   )rA   modulefactorZin_proj_stdZout_proj_stdZfc_stdr#   r#   r$   _init_weights  sL    

 z$CLIPSegPreTrainedModel._init_weightsFc                 C   s   t |tr||_d S r   )r   CLIPSegEncodergradient_checkpointing)rA   r   valuer#   r#   r$   _set_gradient_checkpointing  s    
z2CLIPSegPreTrainedModel._set_gradient_checkpointingN)F)
rF   rG   rH   rI   r   config_classZbase_model_prefixZsupports_gradient_checkpointingr   r   r#   r#   r#   r$   r     s   +r   aI  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CLIPSegConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
aE  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                	       s`   e Zd ZdZed fddZd	eej eej ee	 ee	 ee	 e
eef dddZ  ZS )
r   z
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    rT   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r#   )r   r>   _rT   r#   r$   
<listcomp>T  s     z+CLIPSegEncoder.__init__.<locals>.<listcomp>F)	r_   r`   rU   r   
ModuleListranger   layersr   rm   rn   rT   r$   r`   Q  s    
 zCLIPSegEncoder.__init__N)r   r   r   output_hidden_statesreturn_dictr'   c                    s   dk	r n| j j |dk	r |n| j j}|dk	r4|n| j j}|rDdnd} rPdnd}|}	t| jD ]r\}
}|rx||	f }| jr| jr fdd}tj	j

|||	||}n||	|| d}|d }	 rb||d f }qb|r||	f }|stdd	 |	||fD S t|	||d
S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr#   c                    s    fdd}|S )Nc                     s    | f S r   r#   )inputs)r   r   r#   r$   custom_forward  s    zMCLIPSegEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr#   )r   r   r   )r   r$   create_custom_forward  s    z5CLIPSegEncoder.forward.<locals>.create_custom_forwardr   r   r   c                 s   s   | ]}|d k	r|V  qd S r   r#   r>   vr#   r#   r$   rB     s      z)CLIPSegEncoder.forward.<locals>.<genexpr>)last_hidden_staterM   rN   )rU   r   r   use_return_dict	enumerater   r   r   r   utils
checkpointrD   r
   )rA   r   r   r   r   r   r   Zencoder_statesall_attentionsrM   idxZencoder_layerr   layer_outputsr#   r   r$   r   W  sH    &

  zCLIPSegEncoder.forward)NNNNN)rF   rG   rH   rI   r   r`   r   r   r   r   r   r   r
   r   r   r#   r#   rn   r$   r   H  s   	     
r   )input_ids_shaper   r)   past_key_values_lengthc                 C   s   | \}}t j||ft |j|d}t j|d|d}|||d |ddk d ||}|dkrt j	t j
||||d|gdd}|ddddddf |d||| S )zB
    Make causal mask used for bi-directional self-attention.
    r(   r\   r   r   r   r)   r~   N)r   fullr   r    r+   r   Zmasked_fill_ru   r   rw   Zzerosr   )r   r   r)   r   r!   r   r   Z	mask_condr#   r#   r$   _make_causal_mask  s    "
 r   c                       sx   e Zd Zed fddZeeeeedd	e	e
j e	e
j e	e
j e	e e	e e	e eeef dddZ  ZS )
CLIPSegTextTransformerrT   c                    sH   t    || _|j}t|| _t|| _tj	||j
d| _|j| _d S r   )r_   r`   rU   ra   r   r   r   encoderr   r   r   final_layer_normeos_token_idr   rn   r#   r$   r`     s    


zCLIPSegTextTransformer.__init__output_typer   Nr   r   rZ   r   r   r   r'   c                 C   sn  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}|dkrLtd| }|d|d }| j||d}t||j	|j
d}	|dk	rt||j	}| j|||	|||d}
|
d }| |}| jdkr|tj|jd |j
d|jtj|j
d	jdd
f }n>|tj|jd |j
d|jtj|j
d	| jk jdd
f }|sZ||f|
dd  S t|||
j|
jdS )
        Returns:

        NzYou have to specify input_idsr\   )r   rZ   r(   )r   r   r   r   r   r   r   rY   r   r~   r   r   pooler_outputrM   rN   )rU   r   r   r   rq   r   ru   r   r   r   r)   r%   r   r   r   r   r+   r   r   rr   Zargmaxr   rM   rN   )rA   r   r   rZ   r   r   r   Zinput_shaperM   r   encoder_outputsr   rQ   r#   r#   r$   r     sV    	
zCLIPSegTextTransformer.forward)NNNNNN)rF   rG   rH   r   r`   r   CLIPSEG_TEXT_INPUTS_DOCSTRINGr   r   r   r   r   r   r   r   r   r   r#   r#   rn   r$   r     s$   
      
r   c                       s   e Zd ZeZddgZed fddZejdddZ	d	d
 Z
eeeeeddeej eej eej ee ee ee eeef dddZ  ZS )CLIPSegTextModelr   r   rT   c                    s"   t  | t|| _|   d S r   )r_   r`   r   
text_model	post_initrm   rn   r#   r$   r`   %  s    
zCLIPSegTextModel.__init__r9   c                 C   s
   | j jjS r   r   r   r   r@   r#   r#   r$   get_input_embeddings+  s    z%CLIPSegTextModel.get_input_embeddingsc                 C   s   || j j_d S r   r   )rA   r   r#   r#   r$   set_input_embeddings.  s    z%CLIPSegTextModel.set_input_embeddingsr   Nr   c                 C   s   | j ||||||dS )aM  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rZ   r   r   r   )r   )rA   r   r   rZ   r   r   r   r#   r#   r$   r   1  s    zCLIPSegTextModel.forward)NNNNNN)rF   rG   rH   r   r   Z_no_split_modulesr`   r   Moduler   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r#   rn   r$   r      s,   
      
r   c                
       sh   e Zd Zed fddZeeeeedd	e	e
j e	e e	e e	e eeef dddZ  ZS )
CLIPSegVisionTransformerrT   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )r_   r`   rU   ra   rS   r   r   r   r   pre_layrnormr   r   post_layernormr   rn   r#   r$   r`   Y  s    


z!CLIPSegVisionTransformer.__init__r   Nr}   r   r   r   r'   c           	      C   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}|dkrLtd| |}| |}| j||||d}|d }|dddddf }| |}|s||f|dd  S t	|||j
|jdS )r   Nz You have to specify pixel_values)r   r   r   r   r   r   r   )rU   r   r   r   rq   r   r   r   r   r   rM   rN   )	rA   r}   r   r   r   rM   r   r   rQ   r#   r#   r$   r   c  s2    


z CLIPSegVisionTransformer.forward)NNNN)rF   rG   rH   r   r`   r   CLIPSEG_VISION_INPUTS_DOCSTRINGr   r   r   r   rJ   r   r   r   r   r   r#   r#   rn   r$   r   W  s   

    
r   c                
       s   e Zd ZeZdZed fddZejdddZ	e
eeeeddeej ee ee ee eeef d
ddZ  ZS )CLIPSegVisionModelr}   rT   c                    s"   t  | t|| _|   d S r   )r_   r`   r   vision_modelr   rm   rn   r#   r$   r`     s    
zCLIPSegVisionModel.__init__r9   c                 C   s
   | j jjS r   )r  r   rg   r@   r#   r#   r$   r     s    z'CLIPSegVisionModel.get_input_embeddingsr   Nr  c                 C   s   | j ||||dS )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r}   r   r   r   )r  )rA   r}   r   r   r   r#   r#   r$   r     s    zCLIPSegVisionModel.forward)NNNN)rF   rG   rH   r   r   Zmain_input_namer`   r   r   r   r   r  r   r   r   r   rJ   r   r   r   r   r   r#   r#   rn   r$   r    s"   
    
r  c                       s  e Zd ZeZed fddZeedee	j
 ee	j
 ee	j
 ee ee ee e	jdddZeedee	j ee ee ee e	jdd	d
Zeeeeeddee	j ee	j ee	j
 ee	j ee ee ee ee eeef d	ddZ  ZS )r   rT   c                    s   t  | t|jts.tdt|j dt|jtsPtdt|j d|j}|j}|j	| _	|j
| _|j
| _t|| _t|| _tj| j| j	dd| _tj| j| j	dd| _tt| jj| _|   d S )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)rX   )r_   r`   r   text_configr   rq   typevision_configr   projection_dimra   r   r   r   r   r   r  r   r   r   r   rd   r   r   rU   Zlogit_scale_init_valuelogit_scaler   )rA   rU   r  r	  rn   r#   r$   r`     s(    

zCLIPSegModel.__init__Nr   c           
      C   sh   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||||d}|d }| |}	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```Nr   r   )rU   r   r   r   r   r   )
rA   r   r   rZ   r   r   r   text_outputsrQ   Ztext_featuresr#   r#   r$   get_text_features  s    	
zCLIPSegModel.get_text_featuresr  c                 C   sd   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||d}|d }| |}|S )aI  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```Nr  r   )rU   r   r   r   r  r   )rA   r}   r   r   r   vision_outputsrQ   Zimage_featuresr#   r#   r$   get_image_features  s    
zCLIPSegModel.get_image_featuresr   )	r   r}   r   rZ   return_lossr   r   r   r'   c	              	   C   s*  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||d}	| j||||||d}
|	d }| |}|
d }| |}||jdddd }||jdddd }| j	
 }t|| | }| }d}|rt|}|s|||||
|	f}|dk	r|f| S |S t||||||
|	d	S )
a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr  r   r   rY   r\   T)r   r   Zkeepdim)r2   r3   r4   r5   r6   r7   r8   )rU   r   r   r   r  r   r   r   Znormr  expr   matmulr/   r0   r1   )rA   r   r}   r   rZ   r  r   r   r   r  r  r6   r5   r  r4   r3   r2   outputr#   r#   r$   r   K  sT    &	


zCLIPSegModel.forward)NNNNNN)NNNN)NNNNNNNN)rF   rG   rH   r   r   r`   r   r   r   r   r   r   rJ   r  r  r  CLIPSEG_INPUTS_DOCSTRINGr   r1   r   r   r   r   r   r#   r#   rn   r$   r     sf          .    0
        
r   c                       sN   e Zd ZdZed fddZd	ejejejee	 e
ej dddZ  ZS )
CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    rT   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S r   r   rm   rn   r#   r$   r`     s    


zCLIPSegDecoderLayer.__init__Fr   c                 C   sd   |}| j ||||d\}}|| }| |}|}| |}|| }| |}|f}|r`||f7 }|S r   )r   r   r   r   r   r#   r#   r$   r     s"    




zCLIPSegDecoderLayer.forward)F)rF   rG   rH   rI   r   r`   r   r   r   r   r   rJ   r   r   r#   r#   rn   r$   r    s    r  c                       sN   e Zd Zed fddZd	eej ejee	 ee	 ee	 dddZ
  ZS )
CLIPSegDecoderrT   c                    sX  t     j| _t j j| _t j j| _ j	r j
jd  j
jd f}ttj j jdddt tj j jd |d |d dt tj jd d|d |d d| _ntj jd j
j j
jd| _t j}t fd	d
t|D | _t j
 j_ j_ j_d_tfdd
tt jD | _d S )N   r   r   )rV   paddingrY   r   )rV   rW   )rW   c                    s   g | ]}t  jj jqS r#   )r   r   r	  ra   
reduce_dimr   rT   r#   r$   r     s     z+CLIPSegDecoder.__init__.<locals>.<listcomp>Zreluc                    s   g | ]}t  qS r#   )r  r   )decoder_configr#   r$   r     s     )r_   r`   conditional_layerr   r   r
  r  film_mulfilm_addZ"use_complex_transposed_convolutionr	  rc   Z
Sequentialrf   ZReLUZConvTranspose2dtransposed_convolutionr,   extract_layersr   r   reducescopydeepcopyra   Zdecoder_num_attention_headsr   Zdecoder_intermediate_sizer   r   r   )rA   rU   Ztransposed_kernelsdepthrn   )rU   r  r$   r`     sN          
zCLIPSegDecoder.__init__NT)rM   rP   r   r   r   c                 C   sp  |rdnd }|rdnd }|d d d }d }	t t|| j| jD ]\}
\}}}|	d k	rb|||	 }	n||}	|
| jkr| ||	ddd | | }	|	ddd}	||	d d |d}|d }	|r||	f7 }|r>||d f7 }q>|	d d dd d d f ddd}	tt	
|	jd }|jd }|	||	jd ||}	| |	 }|sbtdd |||fD S t|||d	S )
Nr#   r\   r   r   rY   )r   r   r   c                 s   s   | ]}|d k	r|V  qd S r   r#   r   r#   r#   r$   rB   =  s      z)CLIPSegDecoder.forward.<locals>.<genexpr>)r&   rM   rN   )r   zipr   r   r  r  Zpermuter  rr   r   r   r   ru   r  rv   rD   rL   )rA   rM   rP   r   r   r   Zall_hidden_statesr   activationsr  iZ
activationlayerreducer   r   r   r&   r#   r#   r$   r     sH    "
   
$
zCLIPSegDecoder.forward)NNT)rF   rG   rH   r   r`   r   r   r   r   r   r   r   r#   r#   rn   r$   r    s   .   r  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    c                       s   e Zd ZeZed fddZdeeej	 eej	 eej	 eej	 dddZ
eeeeeddeej eej eej eej eej	 eej eej ee ee ee eeef d	d
dZ  ZS )CLIPSegForImageSegmentationrT   c                    s:   t  | || _t|| _|j| _t|| _|   d S r   )	r_   r`   rU   r   r   r  r  decoderr   rm   rn   r#   r$   r`   O  s    

z$CLIPSegForImageSegmentation.__init__Nr   r   r   rZ   conditional_pixel_valuesc              	   C   s   |d k	rDt ||krtdt  | jj|||d}W 5 Q R X nF|d k	rt ||kr`tdt  | j|}W 5 Q R X ntd|S )Nz@Make sure to pass as many prompt texts as there are query images)r   rZ   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r,   rq   r   no_gradr   r  r  )rA   r   r   r   rZ   r,  rP   r#   r#   r$   get_conditional_embeddings\  s$    
  
z6CLIPSegForImageSegmentation.get_conditional_embeddingsr   )r   r}   r,  rP   r   rZ   labelsr   r   r   r'   c              	      s  |
dk	r|
n| j j}
t  | jj||d|
d}| j|d }|
rL|jn|d   fdd| jD }|
rt	|j
|j|	r|jnd|jd}n |	s|dd |d	d  n|}W 5 Q R X |dkr| j|jd
 ||||d}n8|jd
 |jd
 krtd|jd | j jkrtd| j||||	|
d}|
r4|jn|d
 }d}|dk	rh||j}t }|||}|
s|||||f}|dk	r|f| S |S t||||||dS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr  r   rY   c                    s   g | ]} |d   qS )r   r#   )r>   r&  rM   r#   r$   r     s     z7CLIPSegForImageSegmentation.forward.<locals>.<listcomp>r   r   r   r+  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r   r   )r2   r&   rP   rQ   r8   rR   )rU   r   r   r-  r   r  r   rM   r  r   r   r   rN   r.  r   rq   r
  r*  r&   r   r)   r   ZBCEWithLogitsLossrO   )rA   r   r}   r,  rP   r   rZ   r/  r   r   r   r  rQ   r%  Zdecoder_outputsr&   r2   Zloss_fnr  r#   r0  r$   r   y  sx    ,


z#CLIPSegForImageSegmentation.forward)NNNNN)
NNNNNNNNNN)rF   rG   rH   r   r   r`   rr   r   r   r   r.  r   r  r   rO   r   rJ   r   r   r   r   r1   r   r   r#   r#   rn   r$   r)  F  sN        
          
r)  )N)r   )CrI   r!  r   dataclassesr   typingr   r   r   r   r   Ztorch.utils.checkpointr   r%  r	   Zmodeling_outputsr
   r   Zmodeling_utilsr   r   r   r   r   r   r   Zconfiguration_clipsegr   r   r   Z
get_loggerrF   loggerZ_CHECKPOINT_FOR_DOCZ%CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LISTr   r   rr   r%   r-   r0   r1   rL   rO   r   rS   r   r   r   r   r   ZCLIPSEG_START_DOCSTRINGr   r  r  r   Sizer)   r   r   r   r   r  r   r  r  r)  r#   r#   r#   r$   <module>   sr   
%>"i2: 'i    _7<4 b9d