""" PyTorch OWL-ViT model."""

import warnings
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_vision_available,
    logging,
    replace_return_docstrings,
)
from .configuration_owlvit import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig


if is_vision_available():
    from transformers.image_transforms import center_to_corners_format


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google/owlvit-base-patch32"

OWLVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/owlvit-base-patch32",
    "google/owlvit-base-patch16",
    "google/owlvit-large-patch14",
]


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
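
# A minimal usage sketch (illustrative only; the helper name is hypothetical): a padding mask of
# shape [bsz, seq_len] becomes an additive bias of shape [bsz, 1, tgt_len, src_len], with 0.0 at
# kept positions and the most negative representable value at masked positions.
def _expand_mask_example() -> torch.Tensor:
    attention_mask = torch.tensor([[1, 1, 0]])  # one sequence of length 3, last position padded
    return _expand_mask(attention_mask, dtype=torch.float32)  # shape [1, 1, 3, 3]
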

def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
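
# A minimal usage sketch (illustrative only; the helper name is hypothetical): `owlvit_loss`
# averages a text-to-image and an image-to-text cross-entropy, treating the i-th text and the
# i-th image in the batch as the matching pair (the diagonal of the similarity matrix).
def _owlvit_loss_example() -> torch.Tensor:
    logits_per_text = torch.randn(4, 4)  # hypothetical similarity scores for 4 image-text pairs
    return owlvit_loss(logits_per_text)  # scalar loss tensor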
ejed< dZejed< dZejed< dZeed< dZeed	< ee d
ddZdS )OwlViTOutputa%  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`OwlViTVisionModel`].
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    # degenerate boxes give inf / nan results, so do an early check
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area
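
# A minimal numeric sketch (illustrative only; the helper name is hypothetical): the unit square
# and the same square shifted right by 0.5 overlap on half their area, so IoU = 1/3; their
# enclosing box equals their union, so the generalized IoU is also 1/3.
def _box_iou_example() -> Tuple[torch.Tensor, torch.Tensor]:
    boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0]])  # (x1, y1, x2, y2) corner format
    boxes2 = torch.tensor([[0.5, 0.0, 1.5, 1.0]])
    iou, _ = box_iou(boxes1, boxes2)  # tensor([[0.3333]])
    giou = generalized_box_iou(boxes1, boxes2)  # tensor([[0.3333]])
    return iou, giou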
 ed< dZejed< dZejed< dZejed< dZejed< dZejed	< dZeed
< dZeed< ee dddZdS )OwlViTObjectDetectionOutputa  
    Output type of [`OwlViTForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
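
# A minimal consumption sketch (illustrative only; the helper name is hypothetical): per-patch
# detection scores are sigmoids of the class logits, maximized over the text-query axis, and the
# predicted (center_x, center_y, width, height) boxes can be converted to corner format for
# thresholding or NMS. `center_to_corners_format` is only available with the vision extras.
def _read_detection_output_example(outputs: "OwlViTObjectDetectionOutput") -> Tuple[torch.Tensor, torch.Tensor]:
    scores = torch.sigmoid(outputs.logits).max(dim=-1).values  # [batch_size, num_patches]
    boxes = center_to_corners_format(outputs.pred_boxes)  # [batch_size, num_patches, 4]
    return scores, boxes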
ejed< dZejed< dZejed< dZeed	< dZeed
< ee dddZdS )&OwlViTImageGuidedObjectDetectionOutputa  
    Output type of [`OwlViTForObjectDetection.image_guided_detection`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual target image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual query image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    logits: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    query_image_embeds: torch.FloatTensor = None
    target_pred_boxes: torch.FloatTensor = None
    query_pred_boxes: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class OwlViTVisionEmbeddings(nn.Module):
    def __init__(self, config: OwlViTVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )

        self.num_patches = (config.image_size // config.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch_size, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


class OwlViTTextEmbeddings(nn.Module):
    def __init__(self, config: OwlViTTextConfig):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings
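
# A minimal shape sketch (illustrative only; the helper name and token ids are hypothetical):
# token and position embeddings are summed, so the output keeps the (batch, seq_len) layout of
# `input_ids` with a trailing hidden_size dimension.
def _text_embeddings_example() -> torch.Size:
    config = OwlViTTextConfig()  # default text config
    embeddings = OwlViTTextEmbeddings(config)
    input_ids = torch.tensor([[49406, 320, 1125, 49407]])  # batch of one tokenized query
    return embeddings(input_ids).shape  # torch.Size([1, 4, config.hidden_size])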
 eeje	ej e	eej  f d	d
dZ  ZS )OwlViTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkrZtd| j d| j d| jd | _	|j
| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      )ry   rz   rr   r{   r|   Znum_attention_heads	num_headshead_dimrd   scaleZattention_dropoutdropoutr	   Lineark_projv_projq_projout_projr   r   r'   r(   rz   c  s    
zOwlViTAttention.__init__)tensorseq_lenr%   c                 C   s    | ||| j| jdd S )Nr   rX   )viewr   r   r   
contiguous)rF   r   r   r%   r'   r'   r(   _shapev  s    zOwlViTAttention._shapeNFhidden_statesattention_maskcausal_attention_maskoutput_attentionsr+   c                 C   s  |  \}}}| || j }| | |d|}	| | |d|}
|| j d| jf}| |||j| }|	j| }	|
j| }
|	 d}t	
||	dd}|  || j ||fkrtd|| j ||f d|   |dk	rD|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}|dk	r|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd}|r||| j||}||| j ||}nd}tjj|| j| jd	}||
j}t	
||
}|  || j || jfkr^td
|| j|| jf d|   ||| j|| j}|dd}||||}| |}||fS )z#Input shape: Batch x Time x Channelrv   r   rX   z$Attention weights should be of size z	, but is Nz!Attention mask should be of size r   )ptrainingz `attn_output` should be of size )r   r   r   r   r   r   r   r   r   r!   Zbmmr   rd   r	   r.   Zsoftmaxr   r   r    r   reshaper   )rF   r   r   r   r   r%   r   r|   Zquery_statesZ
key_statesZvalue_statesZ
proj_shaper&   attn_weightsZattn_weights_reshapedZ
attn_probsZattn_outputr'   r'   r(   r   y  sZ    	





zOwlViTAttention.forward)NNF)rK   rL   rM   rN   rz   r!   r   rU   r   r   r"   r   r   r   r'   r'   r   r(   r   `  s      r   c                       s0   e Zd Z fddZejejdddZ  ZS )	OwlViTMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S rQ   )ry   rz   rr   r   Z
hidden_actactivation_fnr	   r   r{   Zintermediate_sizefc1fc2r   r   r'   r(   rz     s
    
zOwlViTMLP.__init__)r   r+   c                 C   s"   |  |}| |}| |}|S rQ   )r   r   r   )rF   r   r'   r'   r(   r     s    


zOwlViTMLP.forward)rK   rL   rM   rz   r!   r   r   r   r'   r'   r   r(   r     s   r   c                       sJ   e Zd Zed fddZdejejejee e	ej
 dddZ  ZS )	OwlViTEncoderLayerrq   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)Zeps)ry   rz   r{   r|   r   	self_attnr	   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   r   r'   r(   rz     s    


zOwlViTEncoderLayer.__init__Fr   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r`||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   r   )r   r   r   r   )rF   r   r   r   r   Zresidualr   outputsr'   r'   r(   r     s"    




zOwlViTEncoderLayer.forward)F)rK   rL   rM   r   rz   r!   r   r   r"   r   rO   r   r   r'   r'   r   r(   r     s    r   c                   @   s4   e Zd ZdZeZdZdZdgZdd Z	ddd	Z
d
S )OwlViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    owlvitTr   c                 C   sX  | j j}t|trF|jjjjd|d d |jjjjd|d d nt|t	r| j j}t
jj|jd|jd | d t
jj|jj|j j| d t
jj|jj|j j| d nTt|trD| j j}|jd d|j j d  | }|jd | }t
jj|jj|d t
jj|jj|d t
jj|jj|d t
jj|jj|d nt|tr| j j}|j jd d|j j d  | }d|j j d | }t
jj|jj|d t
jj|jj|d nPt|trt
jj|jj|jd | j j d t
jj|jj|jd | j j d t|t
jr.|j j!  |jj"d t|t
j#rT|j dk	rT|j j!  dS )	zInitialize the weights        g{Gz?)meanstdr   )r   rX   r   N)$rr   Zinitializer_factor
isinstancer   r   weightdataZnormal_r   rp   r	   initr~   r|   r   Zinitializer_ranger   num_hidden_layersr   r   r   r   r   r{   r   r   OwlViTModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   rs   Zzero_Zfill_r   )rF   modulefactorZin_proj_stdZout_proj_stdZfc_stdr'   r'   r(   _init_weights  sL    

 z#OwlViTPreTrainedModel._init_weightsFc                 C   s   t |tr||_d S rQ   )r   OwlViTEncodergradient_checkpointing)rF   r   valuer'   r'   r(   _set_gradient_checkpointingC  s    
z1OwlViTPreTrainedModel._set_gradient_checkpointingN)F)rK   rL   rM   rN   r   config_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesr   r   r'   r'   r'   r(   r     s   *r   aP  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config ([`OwlViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

OWLVIT_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                	       s`   e Zd ZdZed fddZd	eej eej ee	 ee	 ee	 e
eef dddZ  ZS )
r   z
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`OwlViTEncoderLayer`].

    Args:
        config: OwlViTConfig
    rq   c                    s4   t    t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r'   )r   )rC   _rq   r'   r(   
<listcomp>  s     z*OwlViTEncoder.__init__.<locals>.<listcomp>F)ry   rz   r	   Z
ModuleListranger   layersr   r   r   rq   r(   rz     s    
 zOwlViTEncoder.__init__N)r   r   r   output_hidden_statesreturn_dictr+   c                    s   dk	r n| j j |dk	r |n| j j}|dk	r4|n| j j}|rDdnd} rPdnd}|}	| jD ]n}
|rp||	f }| jr| jr fdd}tjj		||
|	||}n|
|	|| d}|d }	 r^||d f }q^|r||	f }|st
dd	 |	||fD S t|	||d
S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr'   c                    s    fdd}|S )Nc                     s    | f S rQ   r'   )inputs)r   r   r'   r(   custom_forward  s    zLOwlViTEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr'   )r   r   r   )r   r(   create_custom_forward  s    z4OwlViTEncoder.forward.<locals>.create_custom_forwardr   r   r   c                 s   s   | ]}|d k	r|V  qd S rQ   r'   )rC   vr'   r'   r(   rG     s      z(OwlViTEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)rr   r   r   use_return_dictr   r   r   r!   utils
checkpointrI   r   )rF   r   r   r   r   r   r   Zencoder_statesZall_attentionsr   Zencoder_layerr   Zlayer_outputsr'   r   r(   r     sH    


  zOwlViTEncoder.forward)NNNNN)rK   rL   rM   rN   r   rz   r   r!   r   r"   r   r   r   r   r   r'   r'   r   r(   r     s        
r   )input_ids_shaper   r-   past_key_values_lengthc                 C   s   | \}}t j||ft |j|d}t j|d|d}|||d |ddk d ||}|dkrt j	t j
||||d|gdd}|ddddddf |d||| S )zB
    Make causal mask used for bi-directional self-attention.
    r,   rv   r   r   )r   r-   r   N)r!   fullr#   r$   r/   r   Zmasked_fill_r   r    r   zerosr   )r   r   r-   r   r%   r   r   Z	mask_condr'   r'   r(   _make_causal_mask  s    "
 r   c                       st   e Zd Zed fddZeeeeedd	e	j
ee	j
 ee	j
 ee ee ee eeef dddZ  ZS )
OwlViTTextTransformerrq   c                    s@   t    || _|j}t|| _t|| _tj	||j
d| _d S r   )ry   rz   rr   r{   r   r   r   encoderr	   r   r   final_layer_norm)rF   rr   r|   r   r'   r(   rz   &  s    


zOwlViTTextTransformer.__init__output_typer   N)r   r   rt   r   r   r   r+   c                 C   s  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| }|d|d }| j||d}t||j|j	d}	|dk	rt
||j}| j|||	|||d}
|
d }| |}|tj|jd |j	d|tjjdd|j	f }|s||f|
dd  S t|||
j|
jd	S )

        Returns:
        Nrv   )r   rt   r,   )r   r   r   r   r   r   r   r   r   r   Zpooler_outputr   r   )rr   r   r   r   r   r   r   r   r   r-   r)   r   r   r!   r/   r   r    rU   Zargmaxr   r   r   )rF   r   r   rt   r   r   r   Zinput_shaper   r   encoder_outputsr   pooled_outputr'   r'   r(   r   .  sB    	
zOwlViTTextTransformer.forward)NNNNN)rK   rL   rM   r   rz   r   OWLVIT_TEXT_INPUTS_DOCSTRINGr   r   r!   r   r   r"   r   r   r   r   r'   r'   r   r(   r   %  s"   
     
r   c                       s   e Zd ZeZed fddZejdddZdd Z	e
eeeed	dejeej ee ee ee eeef dddZ  ZS )OwlViTTextModelrq   c                    s"   t  | t|| _|   d S rQ   )ry   rz   r   
text_model	post_initr   r   r'   r(   rz   p  s    
zOwlViTTextModel.__init__r=   c                 C   s
   | j jjS rQ   r   r   r   rE   r'   r'   r(   get_input_embeddingsv  s    z$OwlViTTextModel.get_input_embeddingsc                 C   s   || j j_d S rQ   r   )rF   r   r'   r'   r(   set_input_embeddingsy  s    z$OwlViTTextModel.set_input_embeddingsr   Nr   r   r   r   r   r+   c                 C   s   | j |||||dS )ay  
        Returns:

        Examples:
        ```python
        >>> from transformers import AutoProcessor, OwlViTTextModel

        >>> model = OwlViTTextModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of an astronaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   r   r   )r   )rF   r   r   r   r   r   r'   r'   r(   r   |  s    zOwlViTTextModel.forward)NNNN)rK   rL   rM   r   r   rz   r	   Moduler   r   r   r   r   r   r!   r   r   r"   r   r   r   r   r'   r'   r   r(   r   m  s$   
    
r   c                
       sd   e Zd Zed fddZeeeeedd	e	j
ee ee ee eeef dddZ  ZS )
OwlViTVisionTransformerrq   c                    sP   t    || _t|| _tj|j|jd| _	t
|| _tj|j|jd| _d S r   )ry   rz   rr   rp   r   r	   r   r{   r   pre_layernormr   r   post_layernormr   r   r'   r(   rz     s    


z OwlViTVisionTransformer.__init__r   Nr   r   r   r   r+   c           
      C   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| jjjj}||}| |}| 	|}| j
||||d}|d }|dddddf }	| |	}	|s||	f|dd  S t||	|j|jdS )r   N)r   r   r   r   r   r   r   )rr   r   r   r   r   r   r   r   r    r   r   r   r   r   r   )
rF   r   r   r   r   Zexpected_input_dtyper   r   r   r   r'   r'   r(   r     s2    



zOwlViTVisionTransformer.forward)NNN)rK   rL   rM   r   rz   r   OWLVIT_VISION_INPUTS_DOCSTRINGr   r   r!   rO   r   r"   r   r   r   r   r'   r'   r   r(   r     s   	
   
r   c                
       s   e Zd ZeZdZed fddZejdddZ	e
eeeeddeej ee ee ee eeef d
ddZ  ZS )OwlViTVisionModelr   rq   c                    s"   t  | t|| _|   d S rQ   )ry   rz   r   vision_modelr   r   r   r'   r(   rz     s    
zOwlViTVisionModel.__init__r=   c                 C   s
   | j jjS rQ   )r  r   r   rE   r'   r'   r(   r     s    z&OwlViTVisionModel.get_input_embeddingsr   Nr   c                 C   s   | j ||||dS )a  
        Returns:

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTVisionModel

        >>> model = OwlViTVisionModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r   r   )r  )rF   r   r   r   r   r'   r'   r(   r     s    zOwlViTVisionModel.forward)NNNN)rK   rL   rM   r   r   Zmain_input_namerz   r	   r   r   r   r  r   r   r   r!   rO   r"   r   r   r   r   r'   r'   r   r(   r    s"   
    
r  c                       s   e Zd ZeZed fddZeedee	j
 ee	j
 ee ee ee e	jdddZeedee	j ee ee ee e	jdd	d
Zeeeeeddee	j ee	j ee	j
 ee ee ee ee ee eeef d	ddZ  ZS )r   rq   c                    s   t  | t|jts.tdt|j dt|jtsPtdt|j d|j}|j}|j	| _	|j
| _|j
| _t|| _t|| _tj| j| j	dd| _tj| j| j	dd| _tt|j| _|   d S )NzMconfig.text_config is expected to be of type OwlViTTextConfig but is of type .zQconfig.vision_config is expected to be of type OwlViTVisionConfig but is of type F)rs   )ry   rz   r   text_configr   rd   typevision_configr   Zprojection_dimr{   r   r   r   r   r   r  r	   r   r   r   r}   r!   r   Zlogit_scale_init_valuelogit_scaler   )rF   rr   r  r  r   r'   r(   rz     s(    

zOwlViTModel.__init__Nr   c           	      C   s:   |dk	r|n| j j}| j|||d}|d }| |}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTTextModel`].

        Examples:
        ```python
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of an astronaut"]], return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```N)r   r   r   r   )rr   r   r   r   )	rF   r   r   r   r   r   Ztext_outputr   Ztext_featuresr'   r'   r(   get_text_features1  s
    
zOwlViTModel.get_text_featuresr   c                 C   sd   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||d}|d }| |}|S )aB  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTVisionModel`].

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```Nr  r   )rr   r   r   r   r  r   )rF   r   r   r   r   vision_outputsr   image_featuresr'   r'   r(   get_image_featuresT  s    
zOwlViTModel.get_image_featuresr   )	r   r   r   return_lossr   r   return_base_image_embedsr   r+   c	              	   C   sd  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j||||d}	| j|||||d}
|
d }| |}|	d }| |}|tj	j
|dddd }|tj	j
|dddd }| j |j}t|| | }| }d}|rt|}|rtd	t |	d
 }| j|}n|}|sN|||||
|	f}|dk	rJ|f| S |S t||||||
|	dS )a[  
        Returns:

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr  r   r   rX   rv   T)ordr   keepdimz`return_base_image_embeds` is deprecated and will be removed in v4.27 of Transformers, one can obtain the base (unprojected) image embeddings from outputs.vision_model_output.r   )r6   r7   r8   r9   r:   r;   r<   )rr   r   r   r   r  r   r   r   r!   linalgnormr	  expr    r-   matmulr3   r4   warningswarnFutureWarningr   r5   )rF   r   r   r   r  r   r   r  r   r  text_outputsr9   r:   Ztext_embeds_normr	  r8   r7   r6   r   outputr'   r'   r(   r     sb     

zOwlViTModel.forward)NNNNN)NNNN)NNNNNNNN)rK   rL   rM   r   r   rz   r   r   r   r!   r   r"   rO   r
  r  r  OWLVIT_INPUTS_DOCSTRINGr   r5   r   r   r   r   r   r'   r'   r   r(   r     sb         "    ,
        
r   c                       s6   e Zd Zed fddZejejdddZ  Z	S )OwlViTBoxPredictionHeadrq   c                    sJ   t    |jj}t||| _t||| _t | _	t|d| _
d S )N   )ry   rz   r  r{   r	   r   dense0dense1ZGELUgeludense2)rF   rr   widthr   r'   r(   rz     s    

z OwlViTBoxPredictionHead.__init__)r  r+   c                 C   s6   |  |}| |}| |}| |}| |}|S rQ   )r  r   r  r!  )rF   r  r  r'   r'   r(   r     s    




zOwlViTBoxPredictionHead.forward)
rK   rL   rM   r   rz   r!   r   rO   r   r   r'   r'   r   r(   r    s   	r  c                       sJ   e Zd Zed fddZejeej eej e	ej dddZ
  ZS )OwlViTClassPredictionHeadrq   c                    sZ   t    |jj}|jj| _t| j|| _t| jd| _	t| jd| _
t | _d S )Nr   )ry   rz   r  r{   r  	query_dimr	   r   r  logit_shiftr	  ZELUelu)rF   rr   Zout_dimr   r'   r(   rz     s    

z"OwlViTClassPredictionHead.__init__)r:   query_embeds
query_maskr+   c                 C   s  |  |}|d krJ|j}|jd d \}}t||| jf|}||fS |tjj|dddd  }|tjj|dddd  }t	d||}| 
|}	| |}
| |
d }
||	 |
 }|d k	r|jdkrtj|dd	}|tj}t|d
kd|}|tj}||fS )NrX   rv   T)r   r  gư>z...pd,...qd->...pqr   r   r   r   g    .)r  r-   r   r!   r   r$  r    r  r  einsumr%  r	  r&  ndimZ	unsqueezerS   whererR   )rF   r:   r'  r(  image_class_embedsr-   r   r   pred_logitsr%  r	  r'   r'   r(   r     s(    




z!OwlViTClassPredictionHead.forward)rK   rL   rM   r   rz   r!   rO   r   r   r   r   r   r'   r'   r   r(   r#    s   r#  c                       s  e Zd ZeZed fddZejdddZejejddd	Z	ejejejd
ddZ
d!ejeej eej eej dddZd"ejejejee ee eej dddZd#ejee ee eej dddZejejejdddZeeeeedd$ejeej ee ee ee edddZeeeeedd%ejejeej ee ee ee eddd Z  ZS )&OwlViTForObjectDetectionrq   c                    sP   t  | t|| _t|| _t|| _tj	|j
j|j
jd| _t | _d S r   )ry   rz   r   r   r#  
class_headr  box_headr	   r   r  r{   r   
layer_normZSigmoidsigmoidr   r   r'   r(   rz   +  s    


z!OwlViTForObjectDetection.__init__)feature_mapc              	   C   s   |j dkstd|j}|jd }tjttd|d td|d ddtj	}|t
||gtj	 }||jd |jd  |jd }t||}|S )Nr  zJExpected input shape is [batch_size, num_patches, num_patches, hidden_dim]r   rv   Zaxisr   rX   )r*  rd   r-   r   npstackZmeshgridr/   ZastyperR   arrayr   r!   Z
from_numpyr    )rF   r3  r-   r   box_coordinatesr'   r'   r(   !normalize_grid_corner_coordinates5  s"    

"  z:OwlViTForObjectDetection.normalize_grid_corner_coordinates)r3  r+   c                 C   s   |  |}t|dd}t|d t| d  }t|d|jd  }t|d t| d  }tj||gdd}|S )Nr   r   g-C6?r   rv   r   )r9  r!   Zcliploglog1pZ	full_liker   r   )rF   r3  r8  Zbox_coord_biasZbox_sizeZbox_size_biasZbox_biasr'   r'   r(   compute_box_biasJ  s    
z)OwlViTForObjectDetection.compute_box_bias)image_featsr3  r+   c                 C   s&   |  |}|| |7 }| |}|S )a  
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        )r0  r<  r2  )rF   r=  r3  rj   r'   r'   r(   box_predictorZ  s    

z&OwlViTForObjectDetection.box_predictorN)r=  r'  r(  r+   c                 C   s   |  |||\}}||fS )a8  
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
        )r/  )rF   r=  r'  r(  r-  r,  r'   r'   r(   class_predictorq  s    z(OwlViTForObjectDetection.class_predictor)r   r   r   r   r   r+   c                 C   s   | j |||||dd}|jd }| j j|}tt|jtd }	t	|d d d dd d f |	}
|d d dd d d f |
 }| 
|}|jd tt|jd tt|jd |jd f}	||	}|d }|||fS )NT)r   r   r   r   r   r   r   r   r   r   r   rv   )r   r<   r  r   rI   r5  r7  r   r!   broadcast_tor1  rU   sqrtr   )rF   r   r   r   r   r   r   r   r:   new_sizeclass_token_outr9   r'   r'   r(   image_text_embedder  s,    	

"

z,OwlViTForObjectDetection.image_text_embedder)r   r   r   r+   c           	      C   s   | j j|dd}|d }| j j|}tt|jtd }t|d d d dd d f |}|d d dd d d f | }| 	|}|jd t
t|jd t
t|jd |jd f}||}||fS )NT)r   r   r   r@  r   rv   )r   r  r   rI   r5  r7  r   r!   rB  r1  rU   rC  r   )	rF   r   r   r   r  r   r:   rD  rE  r'   r'   r(   image_embedder  s    "

z'OwlViTForObjectDetection.image_embedder)query_image_featuresquery_feature_mapr+   c                 C   s>  |  |\}}| ||}t|}g }g }|j}	t|jd D ]}
tjddddgg|	d}||
 }t||\}}t	|d dkrt
||}t|d }|d |k }| r>||
 |d }tj||
 dd}td||}|t| }|||
 |  || q>|r,t|}t|}nd\}}|||fS )	Nr   r   r,   r   g?r4  zd,id->i)NN)r?  r>  r   r-   r   r   r!   r   rb   rc   rg   r[   ZnonzeroZnumelZsqueezer   r)  Zargminappendr6  )rF   rH  rI  r   rk   rj   Zpred_boxes_as_cornersZbest_class_embedsbest_box_indicesZpred_boxes_deviceiZeach_query_boxZeach_query_pred_boxesZiousZiou_thresholdZselected_indsZselected_embeddingsZmean_embedsZmean_simZbest_box_indr'  Zbox_indicesr'   r'   r(   embed_image_query  s4    

z*OwlViTForObjectDetection.embed_image_queryr   )r   query_pixel_valuesr   r   r   r+   c              
   C   s"  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j|dd }| j|||d\}}|j\}	}
}
}t||	|
|
 |f}|j\}	}
}
}t||	|
|
 |f}| ||\}}}| j	||d\}}| 
||}|s
||||||| f}tdd |D }|S t||||||d|dS )	a3  
        Returns:

        Examples:
        ```python
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, OwlViTForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch16")
        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
        >>> query_image = Image.open(requests.get(query_url, stream=True).raw)
        >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model.image_guided_detection(**inputs)
        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.Tensor([image.size[::-1]])
        >>> # Convert outputs (bounding boxes and class logits) to COCO API
        >>> results = processor.post_process_image_guided_detection(
        ...     outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes
        ... )
        >>> i = 0  # Retrieve predictions for the first image
        >>> boxes, scores = results[i]["boxes"], results[i]["scores"]
        >>> for box, score in zip(boxes, scores):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
        Detected similar object with confidence 0.856 at location [10.94, 50.4, 315.8, 471.39]
        Detected similar object with confidence 1.0 at location [334.84, 25.33, 636.16, 374.71]
        ```N)r   r   )r   r   r   )r=  r'  c                 s   s   | ]}|d k	r|V  qd S rQ   r'   rC   xr'   r'   r(   rG   H  s      zBOwlViTForObjectDetection.image_guided_detection.<locals>.<genexpr>)r:   rm   rn   ro   r*   rk   r;   r<   )rr   r   r   r   rG  r   r!   r   rM  r?  r>  rA   rI   rl   )rF   r   rN  r   r   r   rI  r3  r  r   r   
hidden_dimr=  Zquery_image_featsr'  rK  ro   r-  rk   rn   r  r'   r'   r(   image_guided_detection  sL    +
	z/OwlViTForObjectDetection.image_guided_detection)r   r   r   r   r   r   r+   c              	   C   s2  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j|||||d\}}}	|	j}
|	j}|j\}}}}t	|||| |f}|jd | }|	|||jd }|	|||jd }|d dk}| 
|||\}}| ||}|s||||||
 | f}tdd |D }|S t||||||
|dS )	a  
        Returns:

        Examples:
        ```python
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, OwlViTForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = [["a photo of a cat", "a photo of a dog"]]
        >>> inputs = processor(text=texts, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.Tensor([image.size[::-1]])
        >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
        >>> results = processor.post_process_object_detection(
        ...     outputs=outputs, threshold=0.1, target_sizes=target_sizes
        ... )

        >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
        >>> text = texts[i]
        >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

        >>> for box, score, label in zip(boxes, scores, labels):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
        Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
        ```N)r   r   r   r   r   r   rv   ).r   c                 s   s   | ]}|d k	r|V  qd S rQ   r'   rO  r'   r'   r(   rG     s      z3OwlViTForObjectDetection.forward.<locals>.<genexpr>)r:   r9   rj   r*   rk   r;   r<   )rr   r   r   r   rF  r;   r<   r   r!   r   r?  r>  rA   rI   rh   )rF   r   r   r   r   r   r   r'  r3  r   r  r  r   r   rQ  r=  Zmax_text_queriesr(  r-  rk   rj   r  r'   r'   r(   r   V  sR    /		z OwlViTForObjectDetection.forward)NN)NN)NN)NNNN)NNNN)rK   rL   rM   r   r   rz   r!   rO   r9  r<  r>  r   r   r   r?  r"   rF  rG  rM  r   5OWLVIT_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRINGr   rl   rR  (OWLVIT_OBJECT_DETECTION_INPUTS_DOCSTRINGrh   r   r   r'   r'   r   r(   r.  (  s   
    -  ! )
    ]
    r.  )N)r   )NrN   r  dataclassesr   typingr   r   r   r   r   numpyr5  r!   Ztorch.utils.checkpointr   r	   Zactivationsr   Zmodeling_outputsr   r   Zmodeling_utilsr   r   r   r   r   r   r   r   Zconfiguration_owlvitr   r   r   Ztransformers.image_transformsr   Z
get_loggerrK   loggerZ_CHECKPOINT_FOR_DOCZ$OWLVIT_PRETRAINED_MODEL_ARCHIVE_LISTr   rU   r)   r1   r4   r5   rV   rY   rb   rg   rh   rl   r   rp   r   r   r   r   r   ZOWLVIT_START_DOCSTRINGr   r  r  rT  rS  r   Sizer-   r   r   r   r   r  r   r  r#  r.  r'   r'   r'   r(   <module>   s~    
&	10 l2:a    H4:2 W1