""" PyTorch BLIP model."""

import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn.functional import normalize

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"

BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Salesforce/blip-vqa-base",
    "Salesforce/blip-vqa-capfilt-large",
    "Salesforce/blip-image-captioning-base",
    "Salesforce/blip-image-captioning-large",
    "Salesforce/blip-itm-base-coco",
    "Salesforce/blip-itm-large-coco",
    "Salesforce/blip-itm-base-flickr",
    "Salesforce/blip-itm-large-flickr",
]


# In-batch contrastive loss: row i of `logits` should score highest at column i.
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


# Symmetric image-text contrastive loss: average of the text->image and image->text directions.
def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
class BlipForConditionalGenerationModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
    last hidden states. This class also adds the loss term from the text decoder.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
            Prediction scores of the language modeling head of the text decoder model.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
            The image embeddings obtained after applying the Vision Transformer model to the input image.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

    @property
    def decoder_logits(self):
        warnings.warn(
            "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the `logits` attribute to retrieve the final output instead.",
            FutureWarning,
        )
        return self.logits


@dataclass
class BlipTextVisionModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
    last hidden states. This class also adds the loss term from the text decoder.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class BlipImageTextMatchingModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
    last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
    scores.

    Args:
        itm_score (`torch.FloatTensor`):
            The image-text similarity scores.
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
            Last layer hidden-state of the vision-only branch of the model.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        question_embeds (`torch.FloatTensor`):
            The question embeddings obtained by the text projection layer.
    """

    itm_score: Optional[torch.FloatTensor] = None
    loss: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    vision_pooler_output: Optional[torch.FloatTensor] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    question_embeds: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class BlipOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`BlipTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`BlipVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class BlipVisionEmbeddings(nn.Module):
    def __init__(self, config: BlipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learnable [CLS] token prepended to the patch sequence.
        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        # Non-overlapping patch projection implemented as a strided convolution.
        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings


class BlipTextEmbeddings(nn.Module):
    def __init__(self, config: BlipTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class BlipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = nn.Dropout(config.attention_dropout)

        # Fused query/key/value projection.
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim)

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        mixed_qkv = (
            self.qkv(hidden_states)
            .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))

        attention_scores = attention_scores * self.scale

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)

        new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        output = self.projection(context_layer)

        outputs = (output, attention_probs) if output_attentions else (output, None)

        return outputs


class BlipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class BlipEncoderLayer(nn.Module):
    def __init__(self, config: BlipConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = BlipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = BlipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class BlipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BlipConfig
    base_model_prefix = "blip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_range
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=factor)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()

        if isinstance(module, BlipVisionEmbeddings):
            if hasattr(self.config, "vision_config"):
                factor = self.config.vision_config.initializer_range
            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)

        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, BlipEncoder):
            module.gradient_checkpointing = value


BLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`BlipConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

BLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

BLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class BlipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`BlipEncoderLayer`].

    Args:
        config (`BlipConfig`):
            The corresponding vision configuration for the `BlipEncoder`.
    """

    def __init__(self, config: BlipConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(encoder_layer),
                    hidden_states,
                    attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class BlipVisionModel(BlipPreTrainedModel):
    main_input_name = "pixel_values"
    config_class = BlipVisionConfig

    def __init__(self, config: BlipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = BlipVisionEmbeddings(config)
        self.encoder = BlipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=BlipVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        # Pool by taking the post-layernormed [CLS] token state.
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.embeddings
 ee	j
 ee	j
 ee e	jdddZeedee	j ee e	jdd	d
Zeeeeeddee	j ee	j ee	j
 ee	j ee ee ee ee eeef d	ddZ  ZS )	BlipModelrM   c                    s   t  | t|jts.tdt|j dt|jtsPtdt|j d|j}|j}|j	| _	|j
| _|j
| _t|| _t|| _tj| j| j	dd| _tj| j| j	dd| _tt| jj| _|   d S )NzKconfig.text_config is expected to be of type BlipTextConfig but is of type .zOconfig.vision_config is expected to be of type BlipVisionConfig but is of type F)r   )rP   rQ   r   text_configr   r|   typer   r   Zprojection_dimrR   Ztext_embed_dimZvision_embed_dimr   
text_modelr   vision_modelr   r   visual_projectiontext_projectionrT   r   r   rN   Zlogit_scale_init_valuelogit_scaler   )r0   rN   r   r   rZ   r    r!   rQ     s(    

zBlipModel.__init__N)ru   r   rp   r   r   c                 C   s<   |dk	r|n| j j}| j||||d}|d }| |}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`BlipTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, BlipModel

        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```N)ru   r   rp   r   r   )rN   r   r   r   )r0   ru   r   rp   r   text_outputsr   Ztext_featuresr    r    r!   get_text_features  s    
zBlipModel.get_text_features)r\   r   r   c                 C   s8   |dk	r|n| j j}| j||d}|d }| |}|S )aV  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`BlipVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipModel

        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```N)r\   r   r   )rN   r   r   r   )r0   r\   r   vision_outputsr   Zimage_featuresr    r    r!   get_image_features  s
    
zBlipModel.get_image_featuresr   )	ru   r\   r   rp   return_lossr   r   r   r   c	              	   C   s*  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||d}	| j||||||d}
|	d }| |}|
d }| |}||jdddd }||jdddd }| j	
 }t|| | }| }d}|rt|}|s|||||
|	f}|dk	r|f| S |S t||||||
|	d	S )
a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipModel

        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr\   r   r   r   )ru   r   rp   r   r   r   r   rO   r_   T)pra   Zkeepdim)r'   r?   r@   rA   r(   rB   rC   )rN   r   r   r   r   r   r   r   Znormr   expr   r   r$   r%   r>   )r0   ru   r\   r   rp   r   r   r   r   r   r   r(   rA   r   r@   r?   r'   r   r    r    r!   rl   D  sT    &	


zBlipModel.forward)NNNN)NN)NNNNNNNN)r2   r3   r4   r   r   rQ   r   BLIP_TEXT_INPUTS_DOCSTRINGr   r   rm   r   r6   r   r   r   BLIP_INPUTS_DOCSTRINGr   r>   rx   r   r   rl   rn   r    r    rZ   r!   r     sV        %  $
        
r   a  
    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
    `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise,
    the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption
    from the text input. If no text input is provided, the decoder will start with the [BOS] token only.
    c                       s   e Zd ZeZdgZdZed fddZej	dddZ
eeeeed	dejeej eej ee ee eej ee eeef dddZe dejeej eej ejdddZ  ZS )BlipForConditionalGeneration)text_decoder.cls.predictions.decoder.biasr\   rM   c                    sD   t  | t|j| _t|j| _|jj| _	|jj
| _|   d S r   )rP   rQ   r   r   r   r   r   text_decoderbos_token_iddecoder_input_idspad_token_iddecoder_pad_token_idr   rY   rZ   r    r!   rQ     s    

z%BlipForConditionalGeneration.__init__rD   c                 C   s
   | j jjS r   r   rk   rW   r/   r    r    r!   r     s    z1BlipForConditionalGeneration.get_input_embeddingsr   N)r\   ru   r   r   r   labelsr   r   c                 C   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||d}|d }	| j|||	||dd}
|s|
d |
d |	|d f|dd  }
tdd	 |
D S t|
j|
j	|	|j
|j|jd
S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A picture of"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        ```Nr   r   r   )ru   r   encoder_hidden_statesr   r   	reductionr   rO   c                 s   s   | ]}|d k	r|V  qd S r   r    rG   r   r    r    r!   rI     s      z7BlipForConditionalGeneration.forward.<locals>.<genexpr>)r'   r   r(   r)   r*   r+   )rN   r   r   r   r   r   rJ   r&   r'   r   r)   r*   r+   )r0   r\   ru   r   r   r   r   r   r   r(   r   r    r    r!   rl     s<    "	$z$BlipForConditionalGeneration.forward)r\   ru   r   r   c           
   	   K   s   |j d }| j|d}|d }tj| dd tjd|j}t|t	rXt
|}n0|dkrt
| j| jjjgg|d|j}| jjj|dddf< |dk	r|ddddf nd}| jjf |ddddf | jjj| jjj|||d|}	|	S )a  
        Overrides *generate* function to be able to use the model as a conditional generator

        Parameters:
            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
                Input image to be processed
            input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:


        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForConditionalGeneration

        >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        two cats sleeping on a couch
        ```
        r   r\   Nr_   r]   r   )ru   eos_token_idr   r   r   encoder_attention_mask)rb   r   r   onesri   longrd   r   r   listrx   r   rN   r   r   repeatr   r   generatesep_token_idr   )
r0   r\   ru   r   generate_kwargsrj   r   r(   image_attention_maskr   r    r    r!   r     s6    (
$
  
z%BlipForConditionalGeneration.generate)NNNNNN)NN)r2   r3   r4   r   r   _tied_weights_keysr   rQ   r   Moduler   r   r   r   r&   r   r   r6   r   rx   r   r   r   rl   no_gradr   rn   r    r    rZ   r!   r     s@   

      
E  r   aS  
    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
    with the encoding of the image, and the text decoder will output the answer to the question.
    c                       s   e Zd ZeZdgZed fddZejdddZ	e
eeeeddejejeej eej eej ee ee eej ee eeef d

ddZe dejejeej ejdddZ  ZS )BlipForQuestionAnsweringr   rM   c                    sT   t  | t|j| _t|jdd| _t|j| _	|jj
| _|jj| _|   d S )NFZadd_pooling_layer)rP   rQ   r   r   r   r   r   text_encoderr   r   r   r   r   decoder_start_token_idr   rY   rZ   r    r!   rQ   [  s    

z!BlipForQuestionAnswering.__init__rD   c                 C   s
   | j jjS r   r   r/   r    r    r!   r   j  s    z-BlipForQuestionAnswering.get_input_embeddingsr   N)
ru   r\   r   decoder_attention_maskr   r   r   r   r   r   c
              	   C   sP  |dkr|dkrt d|	dk	r$|	n| jj}	|dk	r8|n| jj}|dk	rL|n| jj}| j||||	d}
|
d }tj| dd tj	d}| j
|||||	d}|dk	r|dkr|}|	s|d n|j}| j||||||	dd	}|dk	r|	r|j n
|d  }nd}|	s8|||
d f|
d
d  }tdd |D S t|||
j|
j|
jdS )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForQuestionAnswering

        >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # training
        >>> text = "How many cats are in the picture?"
        >>> label = "2"
        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> labels = processor(text=label, return_tensors="pt").input_ids

        >>> inputs["labels"] = labels
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        >>> loss.backward()

        >>> # inference
        >>> text = "How many cats are in the picture?"
        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```Na  Either `decoder_input_ids` or `labels` should be passed when calling `forward` with `BlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`r   r   r_   r]   ru   r   r   r   r   r   )ru   r   r   r   r   r   r   rO   c                 s   s   | ]}|d k	r|V  qd S r   r    r   r    r    r!   rI     s      z3BlipForQuestionAnswering.forward.<locals>.<genexpr>)r'   r(   r)   r*   r+   )r|   rN   r   r   r   r   r   r   ri   r   r   r)   r   r'   r   rJ   r9   r*   r+   )r0   ru   r\   r   r   r   r   r   r   r   r   r(   r   r=   Zanswer_outputZdecoder_lossr   r    r    r!   rl   m  s`    0

z BlipForQuestionAnswering.forward)ru   r\   r   r   c                 K   s   | j |d}|d }tj| dd tjd|j}t|trLt	|}| j
||||dd}|d }	tj|	 dd tjd|	j}
tj|	ddf| j|	jd	}| jjf || jjj| jjj|	|
d
|}|S )aD  
        Overrides *generate* function to be able to use the model as a conditional generator

        Parameters:
            input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*):
                The sequence used as a prompt for the generation.
            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
                Input image to be processed
            attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
                tokens that are NOT MASKED, `0` for MASKED tokens.
            **generate_kwargs:
                Additional arguments passed to the *generate* function of the decoder


        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForQuestionAnswering

        >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are in the picture?"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```
        r   r   Nr_   r]   Fr   r   )Z
fill_valuer   )ru   r   r   r   r   )r   r   r   ri   r   rd   r   r   r   rx   r   fullr   r   r   rN   r   r   r   )r0   ru   r\   r   r   r   r(   r   Zquestion_outputsr=   Zquestion_attention_maskZbos_idsr   r    r    r!   r     s:    +$

$  	z!BlipForQuestionAnswering.generate)NNNNNNN)N)r2   r3   r4   r   r   r   rQ   r   r   r   r   r   r   r9   r   r   rx   r6   r   r   r   r   rl   r   r   rn   r    r    rZ   r!   r   O  sB   	
       
n r   a   
    BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
    image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
    the image.
    c                       s   e Zd ZeZed fddZejdddZe	e
eeeddejejee eej ee ee ee eeef d
ddZ  ZS )BlipForImageTextRetrievalrM   c                    s   t  | t|j| _t|jdd| _t	|jj
|j| _t	|jj
|j| _t	|jj
d| _t|dst|jjn|j| _t|ds|jjn|j| _|   d S )NFr   rO   r   r   )rP   rQ   r   r   r   r   r   r   r   r   rR   Zimage_text_hidden_sizevision_proj	text_projitm_headr   r   r   r   r   r   rY   rZ   r    r!   rQ   8  s    

z"BlipForImageTextRetrieval.__init__rD   c                 C   s
   | j jjS r   r   r/   r    r    r!   r   V  s    z.BlipForImageTextRetrieval.get_input_embeddingsr   TN)ru   r\   use_itm_headr   r   r   r   r   c                 C   s~  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||d}|d }	tj|	 dd tjd}
|r| j	|||	|
|d}|s|d n|j
}| |dddddf }nv| j	|||d}|s|d n|j
}t| |	dddddf dd}t| |dddddf dd}||  }|sf||d f|d	d  |f }td
d |D S t||j
|j|j|dS )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForImageTextRetrieval

        >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "an image of a cat"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```
        Nr   r   r_   r]   r   )ru   r   r   r`   rO   c                 s   s   | ]}|d k	r|V  qd S r   r    r   r    r    r!   rI     s      z4BlipForImageTextRetrieval.forward.<locals>.<genexpr>)r;   r)   r*   r+   r=   )rN   r   r   r   r   r   r   ri   r   r   r)   r   r   r   r   r$   rJ   r:   r*   r+   )r0   ru   r\   r   r   r   r   r   r   r(   Z
image_attsr=   r   Z
image_featZ	text_featr   r    r    r!   rl   Y  sR    !$$z!BlipForImageTextRetrieval.forward)TNNNN)r2   r3   r4   r   r   rQ   r   r   r   r   r   r   r9   r   r   rx   r6   r   r   r   r   rl   rn   r    r    rZ   r!   r   -  s(   	
     
r   )?r5   r,   dataclassesr   typingr   r   r   r   r   Ztorch.utils.checkpointr   Ztorch.nn.functionalr   Zactivationsr
   Zmodeling_outputsr   r   Zmodeling_utilsr   r   r   r   r   r   r   Zconfiguration_blipr   r   r   Zmodeling_blip_textr   r   Z
get_loggerr2   loggerZ_CHECKPOINT_FOR_DOCZ"BLIP_PRETRAINED_MODEL_ARCHIVE_LISTrm   r"   r%   r&   r9   r:   r>   r   rL   ro   ry   r   r   r   ZBLIP_START_DOCSTRINGr   r   r   r   r   r   r   r   r   r    r    r    r!   <module>   s   
, *$ !F0,%\B M	 % W