""" PyTorch Perceiver model."""

import abc
import math
from dataclasses import dataclass
from functools import reduce
from operator import __add__
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_perceiver import PerceiverConfig


ModalitySizeType = Mapping[str, int]
PreprocessorOutputType = Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]
PreprocessorType = Callable[..., PreprocessorOutputType]
PostprocessorType = Callable[..., Any]

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "deepmind/language-perceiver"
_CONFIG_FOR_DOC = "PerceiverConfig"

PERCEIVER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "deepmind/language-perceiver",
    # See all Perceiver models at https://huggingface.co/models?filter=perceiver
]


@dataclass
class PerceiverModelOutput(ModelOutput):
    """
    Base class for Perceiver base model's outputs, with potential hidden states, attentions and cross-attentions.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    Nlogitslast_hidden_statehidden_states
attentionscross_attentions)__name__
__module____qualname____doc__r!   torchFloatTensor__annotations__r"   r#   r
   r   r$   r%    r-   r-   o/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/perceiver/modeling_perceiver.pyr    <   s   
r    c                   @   s6   e Zd ZU dZdZejed< dZe	e
ej  ed< dS )PerceiverDecoderOutputa  
    Base class for Perceiver decoder outputs, with potential cross-attentions.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
            Output of the basic decoder.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    """

    logits: torch.FloatTensor = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class PerceiverMaskedLMOutput(ModelOutput):
    """
    Base class for Perceiver's masked language model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_latents,
            num_latents)`. Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    Nlossr!   r#   r$   r%   r&   r'   r(   r)   r1   r
   r*   r+   r,   r!   r#   r   r$   r%   r-   r-   r-   r.   r0   m   s   
r0   c                   @   st   e Zd ZU dZdZeej ed< dZ	ejed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dS )PerceiverClassifierOutputa  
    Base class for Perceiver's outputs of sequence/image classification models, optical flow and multimodal
    autoencoding.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


class PerceiverEmbeddings(nn.Module):
    """Construct the latent embeddings."""

    def __init__(self, config):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(config.num_latents, config.d_latents))

    def forward(self, batch_size: int):
        return self.latents.expand(batch_size, -1, -1)


class PerceiverSelfAttention(nn.Module):
    """Multi-headed {cross, self}-attention. Can be used both in the encoder as well as in the decoder."""

    def __init__(
        self,
        config,
        is_cross_attention=False,
        qk_channels=None,
        v_channels=None,
        num_heads=1,
        q_dim=None,
        kv_dim=None,
    ):
        super().__init__()
        self.num_heads = num_heads
        # Q and K must have the same number of channels; default to preserving Q's input shape.
        if qk_channels is None:
            qk_channels = q_dim
        # V's number of channels determines the shape of the output of QKV-attention.
        # Default to the same number of channels used in the key-query operation.
        if v_channels is None:
            v_channels = qk_channels
        if qk_channels % num_heads != 0:
            raise ValueError(f"qk_channels ({qk_channels}) must be divisible by num_heads ({num_heads}).")
        if v_channels % num_heads != 0:
            raise ValueError(f"v_channels ({v_channels}) must be divisible by num_heads ({num_heads}).")

        self.qk_channels = qk_channels
        self.v_channels = v_channels
        self.qk_channels_per_head = self.qk_channels // num_heads
        self.v_channels_per_head = self.v_channels // num_heads

        # Layer normalization
        self.layernorm1 = nn.LayerNorm(q_dim)
        self.layernorm2 = nn.LayerNorm(kv_dim) if is_cross_attention else nn.Identity()

        # Projection matrices
        self.query = nn.Linear(q_dim, qk_channels)
        self.key = nn.Linear(kv_dim, qk_channels)
        self.value = nn.Linear(kv_dim, v_channels)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x, channels_per_head):
        new_x_shape = x.size()[:-1] + (self.num_heads, channels_per_head)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs: Optional[torch.FloatTensor] = None,
        inputs_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        hidden_states = self.layernorm1(hidden_states)
        inputs = self.layernorm2(inputs)

        # If this module is instantiated as cross-attention, the keys and values come from the inputs;
        # the attention mask then needs to mask out the inputs' non-relevant tokens.
        is_cross_attention = inputs is not None
        queries = self.query(hidden_states)

        if is_cross_attention:
            keys = self.key(inputs)
            values = self.value(inputs)
            attention_mask = inputs_mask
        else:
            keys = self.key(hidden_states)
            values = self.value(hidden_states)

        # Reshape from (batch_size, time, channels) to (batch_size, num_heads, time, channels per head).
        queries = self.transpose_for_scores(queries, self.qk_channels_per_head)
        keys = self.transpose_for_scores(keys, self.qk_channels_per_head)
        values = self.transpose_for_scores(values, self.v_channels_per_head)

        # Take the dot product between the queries and keys to get the raw attention scores.
        attention_scores = torch.matmul(queries, keys.transpose(-1, -2))

        batch_size, num_heads, seq_len, q_head_dim = queries.shape
        _, _, _, v_head_dim = values.shape
        hiddens = self.num_heads * v_head_dim

        attention_scores = attention_scores / math.sqrt(q_head_dim)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the PerceiverModel forward() function).
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, values)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (hiddens,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class PerceiverSelfOutput(nn.Module):
    def __init__(self, config, input_channels, output_channels):
        super().__init__()
        self.dense = nn.Linear(input_channels, output_channels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        return hidden_states


class PerceiverAttention(nn.Module):
    """Attention module, including a dense block."""

    def __init__(
        self,
        config,
        is_cross_attention=False,
        qk_channels=None,
        v_channels=None,
        num_heads=1,
        q_dim=None,
        kv_dim=None,
        use_query_residual=True,
    ):
        super().__init__()
        # Multi-head attention
        if is_cross_attention and qk_channels is None:
            if config.cross_attention_shape_for_attention == "q":
                qk_channels = q_dim
            elif config.cross_attention_shape_for_attention == "kv":
                qk_channels = kv_dim
            else:
                raise ValueError(
                    f"Unknown value {config.cross_attention_shape_for_attention} for "
                    "cross_attention_shape_for_attention."
                )
        else:
            if qk_channels is None:
                qk_channels = q_dim
            if v_channels is None:
                v_channels = qk_channels
        self.self = PerceiverSelfAttention(
            config,
            is_cross_attention=is_cross_attention,
            qk_channels=qk_channels,
            v_channels=v_channels,
            num_heads=num_heads,
            q_dim=q_dim,
            kv_dim=kv_dim,
        )
        # Dense block
        output_channels = None
        if is_cross_attention:
            output_channels = q_dim
        else:
            if output_channels is None:
                output_channels = v_channels
        self.output = PerceiverSelfOutput(config, input_channels=self.self.v_channels, output_channels=output_channels)
        self.use_query_residual = use_query_residual
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs: Optional[torch.FloatTensor] = None,
        inputs_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            inputs,
            inputs_mask,
            output_attentions,
        )

        # Output projection
        attention_output = self.output(self_outputs[0])

        # Optionally include a residual to the original queries.
        # Consider omitting the residual if the semantics of query and output
        # are different, e.g. if queries are positions and outputs are pixels.
        if self.use_query_residual:
            attention_output = attention_output + hidden_states

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class PerceiverMLP(nn.Module):
    """A Transformer-style dense module to follow attention."""

    def __init__(self, config, input_size, widening_factor):
        super().__init__()
        self.dense1 = nn.Linear(input_size, widening_factor * input_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
        self.dense2 = nn.Linear(widening_factor * input_size, input_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.dense2(hidden_states)
        return hidden_states


class PerceiverLayer(nn.Module):
    def __init__(
        self,
        config,
        is_cross_attention=False,
        qk_channels=None,
        v_channels=None,
        num_heads=1,
        q_dim=None,
        kv_dim=None,
        widening_factor=4,
        use_query_residual=True,
    ):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = PerceiverAttention(
            config,
            is_cross_attention=is_cross_attention,
            qk_channels=qk_channels,
            v_channels=v_channels,
            num_heads=num_heads,
            q_dim=q_dim,
            kv_dim=kv_dim,
            use_query_residual=use_query_residual,
        )
        self.layernorm = nn.LayerNorm(q_dim)
        self.mlp = PerceiverMLP(config, input_size=q_dim, widening_factor=widening_factor)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs: Optional[torch.FloatTensor] = None,
        inputs_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            inputs,
            inputs_mask,
            output_attentions,
        )
        attention_output = attention_outputs[0]

        outputs = attention_outputs[1:]  # add attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )

        layer_output = layer_output + attention_output  # residual connection

        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        layer_output = self.layernorm(attention_output)
        layer_output = self.mlp(layer_output)
        return layer_output


class PerceiverEncoder(nn.Module):
    """The Perceiver Encoder: a scalable, fully attentional encoder."""

    def __init__(self, config, kv_dim=None):
        super().__init__()
        self.config = config

        # Check that we can use multi-head attention with these shapes.
        if config.d_latents % config.num_self_attention_heads != 0:
            raise ValueError(
                f"num_z_channels ({config.d_latents}) must be divisible by"
                f" num_self_attend_heads ({config.num_self_attention_heads})."
            )
        if config.d_latents % config.num_cross_attention_heads != 0:
            raise ValueError(
                f"num_z_channels ({config.d_latents}) must be divisible by"
                f" num_cross_attend_heads ({config.num_cross_attention_heads})."
            )

        # Construct the cross-attention layer.
        self.cross_attention = PerceiverLayer(
            config,
            is_cross_attention=True,
            qk_channels=config.qk_channels,
            v_channels=config.v_channels,
            num_heads=config.num_cross_attention_heads,
            q_dim=config.d_latents,
            kv_dim=kv_dim,
            widening_factor=config.cross_attention_widening_factor,
            use_query_residual=config.use_query_residual,
        )

        # Construct a single block of self-attention layers.
        # We get deeper architectures by applying this block more than once.
        self_attention_layers = []
        for _ in range(config.num_self_attends_per_block):
            layer = PerceiverLayer(
                config,
                is_cross_attention=False,
                qk_channels=config.qk_channels,
                v_channels=config.v_channels,
                num_heads=config.num_self_attention_heads,
                q_dim=config.d_latents,
                kv_dim=config.d_latents,
                widening_factor=config.self_attention_widening_factor,
            )
            self_attention_layers.append(layer)

        self.self_attends = nn.ModuleList(self_attention_layers)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs: Optional[torch.FloatTensor] = None,
        inputs_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None

        # Apply the cross-attention between the latents (hidden_states) and inputs:
        layer_outputs = self.cross_attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=None,
            inputs=inputs,
            inputs_mask=inputs_mask,
            output_attentions=output_attentions,
        )
        hidden_states = layer_outputs[0]

        if output_attentions:
            all_cross_attentions = all_cross_attentions + (layer_outputs[1],)

        # Apply the block of self-attention layers more than once:
        for _ in range(self.config.num_blocks):
            for i, layer_module in enumerate(self.self_attends):
                if output_hidden_states:
                    all_hidden_states = all_hidden_states + (hidden_states,)

                layer_head_mask = head_mask[i] if head_mask is not None else None

                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=attention_mask,
                    head_mask=layer_head_mask,
                    output_attentions=output_attentions,
                )

                hidden_states = layer_outputs[0]
                if output_attentions:
                    all_self_attentions = all_self_attentions + (layer_outputs[1],)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class PerceiverPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = PerceiverConfig
    base_model_prefix = "perceiver"
    main_input_name = "inputs"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif hasattr(module, "latents"):
            module.latents.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif hasattr(module, "position_embeddings") and isinstance(module, PerceiverTrainablePositionEncoding):
            module.position_embeddings.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.ParameterDict):
            for modality in module.keys():
                module[modality].data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


PERCEIVER_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`PerceiverConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

PERCEIVER_MODEL_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`PerceiverConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
        decoder (*DecoderType*, *optional*):
            Optional decoder to use to decode the latent representation of the encoder. Examples include
            *transformers.models.perceiver.modeling_perceiver.PerceiverBasicDecoder*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverClassificationDecoder*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder*.
        input_preprocessor (*PreprocessorType*, *optional*):
            Optional input preprocessor to use. Examples include
            *transformers.models.perceiver.modeling_perceiver.PerceiverImagePreprocessor*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverTextPreprocessor*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor*.
        output_postprocessor (*PostprocessorType*, *optional*):
            Optional output postprocessor to use. Examples include
            *transformers.models.perceiver.modeling_perceiver.PerceiverImagePostprocessor*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor*,
            *transformers.models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor*.

        Note that you can define your own decoders, preprocessors and/or postprocessors to fit your use-case.
"""

PERCEIVER_INPUTS_DOCSTRING = r"""
    Args:
        inputs (`torch.FloatTensor`):
            Inputs to the perceiver. Can be anything: images, text, audio, video, etc.
        attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    """The Perceiver: a scalable, fully attentional architecture.""",
    PERCEIVER_MODEL_START_DOCSTRING,
)
class PerceiverModel(PerceiverPreTrainedModel):
    def __init__(
        self,
        config,
        decoder=None,
        input_preprocessor: PreprocessorType = None,
        output_postprocessor: PostprocessorType = None,
    ):
        super().__init__(config)
        self.config = config

        self.input_preprocessor = input_preprocessor
        self.output_postprocessor = output_postprocessor
        self.embeddings = PerceiverEmbeddings(config)
        self.encoder = PerceiverEncoder(
            config, kv_dim=input_preprocessor.num_channels if input_preprocessor is not None else config.d_model
        )
        self.decoder = decoder

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.latents

    def set_input_embeddings(self, value):
        self.embeddings.latents = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r=   Zheads_to_pruner   r   r-   r-   r.   _prune_heads  s    zPerceiverModel._prune_headsz(batch_size, sequence_length)output_typer   )rf   rd   subsampled_output_pointsre   rh   r   r   ri   c              
   C   s  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| jdk	rX| |\}}}	n>d}d}	| d | j jkrtd| d  d| j j d| \}
}}|j}|dkrt	j
|
|f|d}| |}| || j j| j j }| j|
d}| j|d||||||d}|d	 }d}| jr|dk	rL|d
 jd	 |d jd	 dd}n|}| jj|||	|d}| j||||d}|j}|r|jdk	r|r|j|j |_n
||j }| jr| j||d}|s|dk	r||f|dd  S |f|dd  S t|||j|j|jdS )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverImageProcessor, PerceiverModel
        >>> from transformers.models.perceiver.modeling_perceiver import (
        ...     PerceiverTextPreprocessor,
        ...     PerceiverImagePreprocessor,
        ...     PerceiverClassificationDecoder,
        ... )
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> # EXAMPLE 1: using the Perceiver to classify texts
        >>> # - we define a TextPreprocessor, which can be used to embed tokens
        >>> # - we define a ClassificationDecoder, which can be used to decode the
        >>> # final hidden states of the latents to classification logits
        >>> # using trainable position embeddings
        >>> config = PerceiverConfig()
        >>> preprocessor = PerceiverTextPreprocessor(config)
        >>> decoder = PerceiverClassificationDecoder(
        ...     config,
        ...     num_channels=config.d_latents,
        ...     trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
        ...     use_query_residual=True,
        ... )
        >>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)

        >>> # you can then do a forward pass as follows:
        >>> tokenizer = PerceiverTokenizer()
        >>> text = "hello world"
        >>> inputs = tokenizer(text, return_tensors="pt").input_ids

        >>> with torch.no_grad():
        ...     outputs = model(inputs=inputs)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 2]

        >>> # to train, one can train the model using standard cross-entropy:
        >>> criterion = torch.nn.CrossEntropyLoss()

        >>> labels = torch.tensor([1])
        >>> loss = criterion(logits, labels)
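
        >>> # illustrative extra (a sketch, not part of the original example): a padded batch can be passed
        >>> # together with the attention mask returned by the tokenizer
        >>> batch = tokenizer(["hello world", "a slightly longer example"], padding=True, return_tensors="pt")
        >>> with torch.no_grad():
        ...     batch_outputs = model(inputs=batch.input_ids, attention_mask=batch.attention_mask)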

        >>> # EXAMPLE 2: using the Perceiver to classify images
        >>> # - we define an ImagePreprocessor, which can be used to embed images
        >>> config = PerceiverConfig(image_size=224)
        >>> preprocessor = PerceiverImagePreprocessor(
        ...     config,
        ...     prep_type="conv1x1",
        ...     spatial_downsample=1,
        ...     out_channels=256,
        ...     position_encoding_type="trainable",
        ...     concat_or_add_pos="concat",
        ...     project_pos_dim=256,
        ...     trainable_position_encoding_kwargs=dict(
        ...         num_channels=256,
        ...         index_dims=config.image_size**2,
        ...     ),
        ... )

        >>> model = PerceiverModel(
        ...     config,
        ...     input_preprocessor=preprocessor,
        ...     decoder=PerceiverClassificationDecoder(
        ...         config,
        ...         num_channels=config.d_latents,
        ...         trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
        ...         use_query_residual=True,
        ...     ),
        ... )

        >>> # you can then do a forward pass as follows:
        >>> image_processor = PerceiverImageProcessor()
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = image_processor(image, return_tensors="pt").pixel_values

        >>> with torch.no_grad():
        ...     outputs = model(inputs=inputs)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 2]

        >>> # to train, one can train the model using standard cross-entropy:
        >>> criterion = torch.nn.CrossEntropyLoss()

        >>> labels = torch.tensor([1])
        >>> loss = criterion(logits, labels)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.input_preprocessor is not None:
            inputs, modality_sizes, inputs_without_pos = self.input_preprocessor(inputs)
        else:
            modality_sizes = None
            inputs_without_pos = None
            if inputs.size()[-1] != self.config.d_model:
                raise ValueError(
                    f"Last dimension of the inputs: {inputs.size()[-1]} doesn't correspond to config.d_model:"
                    f" {self.config.d_model}. Make sure to set config.d_model appropriately."
                )

        batch_size, seq_length, _ = inputs.size()
        device = inputs.device

        # If no attention mask is provided, make them all ones
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)
        # Make the attention mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
        extended_attention_mask = self.invert_attention_mask(attention_mask)

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # input head_mask has shape [num_heads] or [num_blocks x num_heads]
        # and head_mask is converted to shape [num_blocks x batch x num_heads x N x N]
        head_mask = self.get_head_mask(head_mask, self.config.num_blocks * self.config.num_self_attends_per_block)

        embedding_output = self.embeddings(batch_size=batch_size)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=None,
            head_mask=head_mask,
            inputs=inputs,
            inputs_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        logits = None
        if self.decoder:
            if subsampled_output_points is not None:
                output_modality_sizes = {
                    "audio": subsampled_output_points["audio"].shape[0],
                    "image": subsampled_output_points["image"].shape[0],
                    "label": 1,
                }
            else:
                output_modality_sizes = modality_sizes
            decoder_query = self.decoder.decoder_query(
                inputs, modality_sizes, inputs_without_pos, subsampled_points=subsampled_output_points
            )
            decoder_outputs = self.decoder(
                decoder_query,
                z=sequence_output,
                query_mask=extended_attention_mask,
                output_attentions=output_attentions,
            )
            logits = decoder_outputs.logits

            # add cross-attentions of decoder
            if output_attentions and decoder_outputs.cross_attentions is not None:
                if return_dict:
                    encoder_outputs.cross_attentions = (
                        encoder_outputs.cross_attentions + decoder_outputs.cross_attentions
                    )
                else:
                    encoder_outputs = encoder_outputs + decoder_outputs.cross_attentions

            if self.output_postprocessor:
                logits = self.output_postprocessor(logits, modality_sizes=output_modality_sizes)

        if not return_dict:
            if logits is not None:
                return (logits, sequence_output) + encoder_outputs[1:]
            else:
                return (sequence_output,) + encoder_outputs[1:]

        return PerceiverModelOutput(
            logits=logits,
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


@add_start_docstrings("""Example use of Perceiver for masked language modeling.""", PERCEIVER_START_DOCSTRING)
class PerceiverForMaskedLM(PerceiverPreTrainedModel):
    def __init__(self, config: PerceiverConfig):
        super().__init__(config)

        text_preprocessor = PerceiverTextPreprocessor(config)

        trainable_position_encoding_kwargs_decoder = {
            "num_channels": text_preprocessor.num_channels,
            "index_dims": config.max_position_embeddings,
        }

        self.perceiver = PerceiverModel(
            config,
            input_preprocessor=text_preprocessor,
            decoder=PerceiverBasicDecoder(
                config,
                output_num_channels=config.d_latents,
                output_index_dims=config.max_position_embeddings,  # we need to define the seq_len of the inputs beforehand
                num_channels=text_preprocessor.num_channels,
                qk_channels=8 * 32,
                v_channels=text_preprocessor.num_channels,
                num_heads=8,
                use_query_residual=False,
                final_project=False,
                trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder,
            ),
        )
        self.embedding_decoder = PerceiverEmbeddingDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=PerceiverMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        inputs: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        input_ids: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, PerceiverMaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, PerceiverForMaskedLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("deepmind/language-perceiver")
        >>> model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver")

        >>> # training
        >>> text = "This is an incomplete sentence where some words are missing."
        >>> inputs = tokenizer(text, padding="max_length", return_tensors="pt")
        >>> # mask " missing."
        >>> inputs["input_ids"][0, 52:61] = tokenizer.mask_token_id
        >>> labels = tokenizer(text, padding="max_length", return_tensors="pt").input_ids

        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> round(loss.item(), 2)
        19.87

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 2048, 262]

        >>> # inference
        >>> text = "This is an incomplete sentence where some words are missing."
        >>> encoding = tokenizer(text, padding="max_length", return_tensors="pt")

        >>> # mask bytes corresponding to " missing.". Note that the model performs much better if the masked span starts with a space.
        >>> encoding["input_ids"][0, 52:61] = tokenizer.mask_token_id

        >>> # forward pass
        >>> with torch.no_grad():
        ...     outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 2048, 262]

        >>> masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
        >>> tokenizer.decode(masked_tokens_predictions)
        ' missing.'
        ```"""
        if inputs is not None and input_ids is not None:
            raise ValueError("You cannot use both `inputs` and `input_ids`")
        elif inputs is None and input_ids is not None:
            inputs = input_ids

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.perceiver(
            inputs=inputs,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.embedding_decoder(
            outputs.logits if return_dict else outputs[0], embedding_layer=self.perceiver.input_preprocessor.embeddings
        )

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return PerceiverMaskedLMOutput(
            loss=masked_lm_loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@add_start_docstrings("""Example use of Perceiver for text classification.""", PERCEIVER_START_DOCSTRING)
class PerceiverForSequenceClassification(PerceiverPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        trainable_position_encoding_kwargs_decoder = {"num_channels": config.d_latents, "index_dims": 1}

        self.num_labels = config.num_labels
        self.perceiver = PerceiverModel(
            config,
            input_preprocessor=PerceiverTextPreprocessor(config),
            decoder=PerceiverClassificationDecoder(
                config,
                num_channels=config.d_latents,
                trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder,
                use_query_residual=True,
            ),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=PerceiverClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        inputs: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        input_ids: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, PerceiverClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the classification/regression loss. Indices should be in `[0, ..., config.num_labels -
            1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels >
            1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, PerceiverForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("deepmind/language-perceiver")
        >>> model = PerceiverForSequenceClassification.from_pretrained("deepmind/language-perceiver")

        >>> text = "hello world"
        >>> inputs = tokenizer(text, return_tensors="pt").input_ids
        >>> outputs = model(inputs=inputs)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 2]
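
        >>> # illustrative sketch: map the winning logit to a label name via the model config (the names are
        >>> # the generic defaults here, since this head is randomly initialized on top of the MLM checkpoint)
        >>> predicted_label = model.config.id2label[logits.argmax(-1).item()]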
        ```"""
        if inputs is not None and input_ids is not None:
            raise ValueError("You cannot use both `inputs` and `input_ids`")
        elif inputs is None and input_ids is not None:
            inputs = input_ids

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.perceiver(
            inputs=inputs,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits if return_dict else outputs[0]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return PerceiverClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@add_start_docstrings(
    """
Example use of Perceiver for image classification, for tasks such as ImageNet.

This model uses learned position embeddings. In other words, this model is not given any privileged information about
the structure of images. As shown in the paper, this model can achieve a top-1 accuracy of 72.7 on ImageNet.

[`PerceiverForImageClassificationLearned`] uses [`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`]
(with `prep_type="conv1x1"`) to preprocess the input images, and
[`~models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`] to decode the latent representation of
[`PerceiverModel`] into classification logits.
""",
    PERCEIVER_START_DOCSTRING,
)
class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        trainable_position_encoding_kwargs_preprocessor = {"num_channels": 256, "index_dims": config.image_size**2}
        trainable_position_encoding_kwargs_decoder = {"num_channels": config.d_latents, "index_dims": 1}

        self.num_labels = config.num_labels
        self.perceiver = PerceiverModel(
            config,
            input_preprocessor=PerceiverImagePreprocessor(
                config,
                prep_type="conv1x1",
                spatial_downsample=1,
                out_channels=256,
                position_encoding_type="trainable",
                concat_or_add_pos="concat",
                project_pos_dim=256,
                trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_preprocessor,
            ),
            decoder=PerceiverClassificationDecoder(
                config,
                num_channels=config.d_latents,
                trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder,
                use_query_residual=True,
            ),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=PerceiverClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        inputs: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, PerceiverClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, PerceiverForImageClassificationLearned
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("deepmind/vision-perceiver-learned")
        >>> model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned")

        >>> inputs = image_processor(images=image, return_tensors="pt").pixel_values
        >>> outputs = model(inputs=inputs)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 1000]

        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: tabby, tabby cat
        ```N/You cannot use both `inputs` and `pixel_values`r  r   r   r
  r  r  rD   r]   r  r  r=   rf   rd   re   rh   r   r   r   r  rw   r!   r1   r  r   r-   r-   r.   rF     sV    -



"


z.PerceiverForImageClassificationLearned.forward)NNNNNNNNr  r-   r-   r?   r.   r    s,   
        
r  am  
Example use of Perceiver for image classification, for tasks such as ImageNet.

This model uses fixed 2D Fourier position embeddings. As shown in the paper, this model can achieve a top-1 accuracy of
79.0 on ImageNet, and 84.5 when pre-trained on a large-scale dataset (i.e. JFT).

[`PerceiverForImageClassificationFourier`] uses [`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`]
(with `prep_type="pixels"`) to preprocess the input images, and
[`~models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`] to decode the latent representation of
[`PerceiverModel`] into classification logits.
c                       s   e Zd Z fddZeedeee	dd	e
ej e
ej e
ej e
e e
e e
ej e
e e
ej eeef d	ddZ  ZS )
&PerceiverForImageClassificationFourierc              	      sd   t  | ddddd}|jdd}|j| _t|t|dd|d	t||j|dd
d| _|   d S )NT   r$  @   F
concat_posmax_resolution	num_bands	sine_onlyr   r   pixels)r  r   fourier_position_encoding_kwargsr  r   	r6   r7   r:   r  r   r  r	  r   r   r=   r>   -fourier_position_encoding_kwargs_preprocessorr   r?   r-   r.   r7   N  s0    z/PerceiverForImageClassificationFourier.__init__r   r   Nr  c	                 C   s  |dk	r|dk	rt dn|dkr.|dk	r.|}|dk	r:|n| jj}| j||||||d}	|rb|	jn|	d }
d}|dk	rX| jjdkr| jdkrd| j_n4| jdkr|jtj	ks|jtj
krd| j_nd| j_| jjdkr
t }| jdkr||
 | }n
||
|}nN| jjdkr:t }||
d	| j|d	}n| jjdkrXt }||
|}|s|
f|	d
d  }|dk	r|f| S |S t||
|	j|	j|	jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, PerceiverForImageClassificationFourier
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("deepmind/vision-perceiver-fourier")
        >>> model = PerceiverForImageClassificationFourier.from_pretrained("deepmind/vision-perceiver-fourier")

        >>> inputs = image_processor(images=image, return_tensors="pt").pixel_values
        >>> outputs = model(inputs=inputs)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 1000]

        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: tabby, tabby cat
        ```Nr   r  r   r   r
  r  r  rD   r]   r  r  r!  r-   r-   r.   rF   m  sV    -



"


z.PerceiverForImageClassificationFourier.forward)NNNNNNNNr  r-   r-   r?   r.   r"  ?  s,   
        
r"  a/  
Example use of Perceiver for image classification, for tasks such as ImageNet.

This model uses a 2D conv+maxpool preprocessing network. As shown in the paper, this model can achieve a top-1 accuracy
of 82.1 on ImageNet.

[`PerceiverForImageClassificationConvProcessing`] uses [`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`]
(with `prep_type="conv"`) to preprocess the input images, and
[`~models.perceiver.modeling_perceiver.PerceiverClassificationDecoder`] to decode the latent representation of
[`PerceiverModel`] into classification logits.
c                       s   e Zd Z fddZeedeee	dd	e
ej e
ej e
ej e
e e
e e
ej e
e e
ej eeef d	ddZ  ZS )
-PerceiverForImageClassificationConvProcessingc              	      sf   t  | ddddd}|jdd}|j| _t|t|ddd	|d
t||j|ddd| _|   d S )NT)8   r1  r%  Fr&  r   r   convfourier)r  r  r  r,  r  r   r-  r.  r?   r-   r.   r7     s2    z6PerceiverForImageClassificationConvProcessing.__init__r   r   Nr  c	                 C   s  |dk	r|dk	rt dn|dkr.|dk	r.|}|dk	r:|n| jj}| j||||||d}	|rb|	jn|	d }
d}|dk	rX| jjdkr| jdkrd| j_n4| jdkr|jtj	ks|jtj
krd| j_nd| j_| jjdkr
t }| jdkr||
 | }n
||
|}nN| jjdkr:t }||
d	| j|d	}n| jjdkrXt }||
|}|s|
f|	d
d  }|dk	r|f| S |S t||
|	j|	j|	jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, PerceiverForImageClassificationConvProcessing
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("deepmind/vision-perceiver-conv")
        >>> model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/vision-perceiver-conv")

        >>> inputs = image_processor(images=image, return_tensors="pt").pixel_values
        >>> outputs = model(inputs=inputs)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 1000]

        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: tabby, tabby cat
        ```Nr   r  r   r   r
  r  r  rD   r]   r  r  r!  r-   r-   r.   rF     sV    -



"


z5PerceiverForImageClassificationConvProcessing.forward)NNNNNNNNr  r-   r-   r?   r.   r0    s,    
        
r0  a  
Example use of Perceiver for optical flow, for tasks such as Sintel and KITTI. [`PerceiverForOpticalFlow`] uses
[`~models.perceiver.modeling_perceiver.PerceiverImagePreprocessor`] (with *prep_type="patches"*) to preprocess the
input images, and [`~models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder`] to decode the latent
representation of [`PerceiverModel`].

As input, one concatenates 2 subsequent frames along the channel dimension and extract a 3 x 3 patch around each pixel
(leading to 3 x 3 x 3 x 2 = 54 values for each pixel). Fixed Fourier position encodings are used to encode the position
of each pixel in the patch. Next, one applies the Perceiver encoder. To decode, one queries the latent representation
using the same encoding used for the input.
c                       s   e Zd Z fddZeedeee	dd	e
ej e
ej e
ej e
e e
e e
ej e
e eeef dddZ  ZS )
PerceiverForOpticalFlowc                    sx   t  | d|jddd}d|jddd}t|ddddd	d
|d}t||t||j|jddd	d
|dd| _|   d S )Nr%  FTr)  r(  r*  r'  r&  patchesr   6   r]   r3  )r  r  conv_after_patchingconv_after_patching_in_channelstemporal_downsampler  r,        Y@)r   output_image_shaperescale_factorr   r   r  r,  r   )	r6   r7   Z
train_sizer  r   PerceiverOpticalFlowDecoderr   r   r   )r=   r>   r/  Z(fourier_position_encoding_kwargs_decoderZimage_preprocessorr?   r-   r.   r7   m  sH    z PerceiverForOpticalFlow.__init__r   r   N)rf   rd   re   rh   r   r   r   ri   c                 C   s   |dk	r|n| j j}| j||||||d}|r4|jn|d }	d}
|dk	rPtd|s||	f|dd  }|
dk	rx|
f| S |S t|
|	|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the optical flow loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Returns:

        Examples:

        ```python
        >>> from transformers import PerceiverForOpticalFlow
        >>> import torch

        >>> model = PerceiverForOpticalFlow.from_pretrained("deepmind/optical-flow-perceiver")

        >>> # in the Perceiver IO paper, the authors extract a 3 x 3 patch around each pixel,
        >>> # leading to 3 x 3 x 3 = 27 values for each pixel (as each pixel also has 3 color channels)
        >>> # patches have shape (batch_size, num_frames, num_channels, height, width)
        >>> # the authors train on resolutions of 368 x 496
        >>> patches = torch.randn(1, 2, 27, 368, 496)
        >>> outputs = model(inputs=patches)
        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 368, 496, 2]
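
        >>> # a rough, illustrative sketch of how 3 x 3 pixel patches could be built from one (3, 368, 496)
        >>> # frame with unfold, yielding the 27 channels per frame assumed above (how patches are actually
        >>> # extracted is up to the caller)
        >>> frame = torch.randn(1, 3, 368, 496)
        >>> patches_one_frame = torch.nn.functional.unfold(frame, kernel_size=3, padding=1).reshape(1, 27, 368, 496)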
        ```Nr  r   z*Optical flow training is not yet supportedr]   r  	r>   r   r   r!   NotImplementedErrorr3   r#   r$   r%   )r=   rf   rd   re   rh   r   r   r   rw   r!   r1   r   r-   r-   r.   rF     s.    $zPerceiverForOpticalFlow.forward)NNNNNNNr  r-   r-   r?   r.   r4  ^  s(   1
       
r4  a  
Example use of Perceiver for multimodal (video) autoencoding, for tasks such as Kinetics-700.

[`PerceiverForMultimodalAutoencoding`] uses [`~models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor`] to
preprocess the 3 modalities: images, audio and class labels. This preprocessor uses modality-specific preprocessors to
preprocess every modality separately, after which they are concatenated. Trainable position embeddings are used to pad
each modality to the same number of channels to make concatenation along the time dimension possible. Next, one applies
the Perceiver encoder.

[`~models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder`] is used to decode the latent representation of
[`PerceiverModel`]. This decoder uses each modality-specific decoder to construct queries. The decoder queries are
created based on the inputs after preprocessing. However, autoencoding an entire video in a single forward pass is
computationally infeasible, hence one only uses parts of the decoder queries to do cross-attention with the latent
representation. This is determined by the subsampled indices for each modality, which can be provided as additional
input to the forward pass of [`PerceiverForMultimodalAutoencoding`].

[`~models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder`] also pads the decoder queries of the different
modalities to the same number of channels, in order to concatenate them along the time dimension. Next, cross-attention
is performed with the latent representation of [`PerceiverModel`].

Finally, [`~models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor`] is used to turn this tensor into an
actual video. It first splits up the output into the different modalities, and then applies the respective
postprocessor for each modality.

Note that, by masking the classification label during evaluation (i.e. simply providing a tensor of zeros for the
"label" modality), this auto-encoding model becomes a Kinetics 700 video classifier.
c                       s   e Zd Zed fddZeedee	e
dd
eej eej eeeejf  eej ee ee eej ee eee	f d	dd	Z  ZS )"PerceiverForMultimodalAutoencodingr   c                    s\  t  | |j|j }tdt|dd|fdddd|jdt|dd	|j|j|jfdddddd
dt	|dddddd}t
|d|j|jdddd	|j|j|jfdddd}t|dt|d||j f|jdddd|fdddd|t|dddd|jd
dddd |jdd}tt||jdt|jddt||jddd}t||||d| _|   d S )Nr   r3     FTr5  r6  )r  r,  r  samples_per_patch    r   )r  r,  r  r  r:  r   r   r   )r   r   r   )min_padding_size
modalities
mask_probs)concat_preprocessed_inputoutput_shaper   r   position_encoding_onlyr  r,  )rH  r   r   r   rJ  r  r,  r  r   )rH  r   rJ  r  r   )rH  rF  num_outputsr   r   )in_channelsr   )rL  r  )rF  )r   r   r   )r6   r7   Z
num_framesZaudio_samples_per_framePerceiverMultimodalPreprocessorPerceiverAudioPreprocessorrC  r  r  PerceiverOneHotPreprocessor&PerceiverBasicVideoAutoencodingDecoderrI  r   PerceiverMultimodalDecoderr   r	  Z_label_trainable_num_channels PerceiverMultimodalPostprocessorPerceiverAudioPostprocessor PerceiverProjectionPostprocessor$PerceiverClassificationPostprocessorr   r   r   )r=   r>   Zn_audio_samplesr   Zimage_decoderr   r   r?   r-   r.   r7     s    
!
*z+PerceiverForMultimodalAutoencoding.__init__r   r   N)	rf   rd   r   re   rh   r   r   r   ri   c	              	   C   s   |dk	r|n| j j}| j|||||||d}	|r6|	jn|	d }
d}|dk	rRtd|s~|
f|	dd  }|dk	rz|f| S |S t||
|	j|	j|	jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import PerceiverForMultimodalAutoencoding
        >>> import torch
        >>> import numpy as np

        >>> # create multimodal inputs
        >>> images = torch.randn((1, 16, 3, 224, 224))
        >>> audio = torch.randn((1, 30720, 1))
        >>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))

        >>> model = PerceiverForMultimodalAutoencoding.from_pretrained("deepmind/multimodal-perceiver")

        >>> # in the Perceiver IO paper, videos are auto-encoded in chunks
        >>> # each chunk subsamples different index dimensions of the image and audio modality decoder queries
        >>> nchunks = 128
        >>> image_chunk_size = np.prod((16, 224, 224)) // nchunks
        >>> audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks
        >>> # process the first chunk
        >>> chunk_idx = 0
        >>> subsampling = {
        ...     "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
        ...     "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
        ...     "label": None,
        ... }

        >>> outputs = model(inputs=inputs, subsampled_output_points=subsampling)
        >>> logits = outputs.logits
        >>> list(logits["audio"].shape)
        [1, 240]

        >>> list(logits["image"].shape)
        [1, 6272, 3]

        >>> list(logits["label"].shape)
        [1, 700]
        ```N)rf   rd   r   re   rh   r   r   r   z5Multimodal autoencoding training is not yet supportedr]   r  r?  )r=   rf   rd   r   re   rh   r   r   r   rw   r!   r1   r   r-   r-   r.   rF   q  s0    ;	z*PerceiverForMultimodalAutoencoding.forward)NNNNNNNN)r&   r'   r(   r   r7   r   r   r   r   r3   r   r
   r*   rx   r   r   ry   r   r   rF   rH   r-   r-   r?   r.   rA    s,   s
        
rA  rD   c                 C   st   | dkr |st dtf |}n0| dkr@|s4t dtf |}nt d|  d|dkrdt||nt }||fS )z
    Builds the position encoding.

    Args:
    - out_channels: refers to the number of channels of the position encodings.
    - project_pos_dim: if specified, will project the position encodings to this dimension.

    """
    if position_encoding_type == "trainable":
        if not trainable_position_encoding_kwargs:
            raise ValueError("Make sure to pass trainable_position_encoding_kwargs")
        output_pos_enc = PerceiverTrainablePositionEncoding(**trainable_position_encoding_kwargs)
    elif position_encoding_type == "fourier":
        if not fourier_position_encoding_kwargs:
            raise ValueError("Make sure to pass fourier_position_encoding_kwargs")
        output_pos_enc = PerceiverFourierPositionEncoding(**fourier_position_encoding_kwargs)
    else:
        raise ValueError(f"Unknown position encoding type: {position_encoding_type}.")

    # Optionally project the position encodings to a target dimension.
    positions_projection = nn.Linear(out_channels, project_pos_dim) if project_pos_dim > 0 else nn.Identity()

    return output_pos_enc, positions_projection


class PerceiverAbstractDecoder(nn.Module, metaclass=abc.ABCMeta):
    """Perceiver abstract decoder."""

    @abc.abstractmethod
    def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None):
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def num_query_channels(self):
        raise NotImplementedError

    @abc.abstractmethod
    def forward(self, query, z, query_mask=None):
        raise NotImplementedError


class PerceiverProjectionDecoder(PerceiverAbstractDecoder):
    """
    Baseline projection decoder (no cross-attention).

    Args:
        config ([`PerceiverConfig`]):
            Model configuration.
    """

    def __init__(self, config):
        super().__init__()
        self.classifier = nn.Linear(config.d_latents, config.num_labels)

    def decoder_query(self, inputs, modality_sizes=None, inputs_without_pos=None, subsampled_points=None):
        return None

    def forward(
        self, query: torch.Tensor, z: torch.FloatTensor, query_mask: Optional[torch.FloatTensor] = None
    ) -> torch.FloatTensor:
        # (batch_size, num_latents, d_latents) -> (batch_size, d_latents)
        z = torch.mean(z, dim=1)
        # (batch_size, d_latents) -> (batch_size, config.num_labels)
        logits = self.classifier(z)
        return logits
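
# A minimal usage sketch (not part of the original module), assuming a small hypothetical
# PerceiverConfig. It shows how the baseline projection decoder maps the final latents of
# shape (batch_size, num_latents, d_latents) to classification logits.
def _example_projection_decoder():
    import torch

    config = PerceiverConfig(d_latents=32, num_labels=10)
    decoder = PerceiverProjectionDecoder(config)
    latents = torch.randn(2, 64, 32)  # (batch_size, num_latents, d_latents)
    # The query is unused by this decoder; it mean-pools the latents and applies a linear classifier.
    logits = decoder(query=None, z=latents)
    return logits.shape  # torch.Size([2, 10])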
   rb  c                       s   e Zd ZdZdeeee ee ee ee ee ee ee ee ee ee ee ee dd fd	d
Z	e
edddZdddZdejejeej ee edddZ  ZS )r   a  
    Cross-attention-based decoder. This class can be used to decode the final hidden states of the latents using a
    cross-attention operation, in which the latents produce keys and values.

    The shape of the output of this class depends on how one defines the output queries (also called decoder queries).

    Args:
        config ([*PerceiverConfig*]):
            Model configuration.
        output_num_channels (`int`, *optional*):
            The number of channels in the output. Will only be used in case *final_project* is set to `True`.
        position_encoding_type (`str`, *optional*, defaults to "trainable"):
            The type of position encoding to use. Can be either "trainable", "fourier", or "none".
        output_index_dims (`int`, *optional*):
            The number of dimensions of the output queries. Ignored if 'position_encoding_type' == 'none'.
        num_channels (`int`, *optional*, defaults to 128):
            The number of channels of the decoder queries. Ignored if 'position_encoding_type' == 'none'.
        qk_channels (`int`, *optional*):
            The number of channels of the queries and keys in the cross-attention layer.
        v_channels (`int`, *optional*):
            The number of channels of the values in the cross-attention layer.
        num_heads (`int`, *optional*, defaults to 1):
            The number of attention heads in the cross-attention layer.
        widening_factor (`int`, *optional*, defaults to 1):
            The widening factor of the cross-attention layer.
        use_query_residual (`bool`, *optional*, defaults to `False`):
            Whether to use a residual connection between the query and the output of the cross-attention layer.
        concat_preprocessed_input (`bool`, *optional*, defaults to `False`):
            Whether to concatenate the preprocessed input to the query.
        final_project (`bool`, *optional*, defaults to `True`):
            Whether to project the output of the cross-attention layer to a target dimension.
        position_encoding_only (`bool`, *optional*, defaults to `False`):
            Whether to only use this class to define output queries.
    r  N   r   FT)r>   r   r  r   r   subsampled_index_dimsrM   rN   rK   r   r   rH  r   rJ  ri   c                    s   t    || _d | _|| _|| _|dkrDtf d|i|\| _| _|| _|| _	|d kr\|}|| _
|| _|| _|| _| jst|d|||	||j|
|d	| _|rt||nt | _d S )Nnoner  Tr   )r6   r7   r   output_position_encodingsr  position_encoding_kwargsrY  rX  r   r   rf  rH  r   rJ  r   r:   decoding_cross_attentionr   rU   rS   final_layer)r=   r>   r   r  r   r   rf  rM   rN   rK   r   r   rH  r   rJ  ri  r?   r-   r.   r7   A  s@    
zPerceiverBasicDecoder.__init__ri   c                 C   sH   | j dkrtd| jr6d| jkr,| jd S | j S | jrB| jS | jS )Nrg  z`You cannot calculate number of decoder query channels when position_encoding_type is set to noner  )	r  rL   rJ  ri  rh  output_sizer   r   r   r   r-   r-   r.   r]  z  s    



z(PerceiverBasicDecoder.num_query_channelsc           
      C   s  | j dkrtd|d k	rdd t| | jD }tj|dd}|jd }dd	| t	| jd d d f   }t
|d  ||jd |jd g}| j d
kr| |}n$| j dkr| j| j||j|j|d}| |}t||jd d|jd g}n\|jd }|jd	d  }	| j d
kr(| |}n"| j dkrJ| j|	||j|jd}| |}| jr|d krntdtj||gdd}|S )Nrg  zOYou cannot construct decoder queries when position_encoding_type is set to nonec                 S   s   g | ]}t |qS r-   )r*   Z
from_numpy)r   ra   r-   r-   r.   
<listcomp>  s     z7PerceiverBasicDecoder.decoder_query.<locals>.<listcomp>r   rk   r   rD   r]   r  r3  )rB   r   r  posr   r  zMValue is required for inputs_without_pos if concat_preprocessed_input is True)r  rL   npZunravel_indexcpur   r*   stackro   Ztensorbroadcast_torh  r   r  rX  reshaperH  cat)
r=   rf   r   r   r   indicesro  rB   Zpos_embr   r-   r-   r.   r     sJ    

$"

    
 
   

z#PerceiverBasicDecoder.decoder_queryrV   r   r   rh   ri   c           	      C   sR   |rdnd }| j ||d |d |d}|d }|r<||d f }| |}t||dS )Nr-   r   r   r   r!   r%   )rj  rk  r/   )	r=   rV   r   r   rh   r%   r   r   r!   r-   r-   r.   rF     s    
zPerceiverBasicDecoder.forward)r  Nre  NNNr   r   FFTF)NNN)NF)r&   r'   r(   r)   r   rG   r
   r   ry   r7   r`  r]  r   r*   rx   r+   r/   rF   rH   r-   r-   r?   r.   r     sT   '            9
4  r   c                       sb   e Zd ZdZ fddZeedddZddd	Zde	j
e	jee	j ee edddZ  ZS )r	  a  
    Cross-attention based classification decoder. Light-weight wrapper of [`PerceiverBasicDecoder`] for logit output.
    Turns the output of the Perceiver encoder, which is of shape (batch_size, num_latents, d_latents), into a tensor
    of shape (batch_size, num_labels). The queries are of shape (batch_size, 1, num_labels).

    Args:
        config ([`PerceiverConfig`]):
            Model configuration.
    c                    s0   t    |j| _t|f| jdd|| _d S )Nr   )r   r   )r6   r7   r  r   r   )r=   r>   decoder_kwargsr?   r-   r.   r7     s    
z'PerceiverClassificationDecoder.__init__rl  c                 C   s   | j jS r5   r   r]  r   r-   r-   r.   r]    s    z1PerceiverClassificationDecoder.num_query_channelsNc                 C   s   | j j||||dS )Nr   r   r   r\  r-   r-   r.   r     s       z,PerceiverClassificationDecoder.decoder_queryFrx  c                 C   s6   | j |||d}|jd d dd d f }t||jdS )Nrh   r   ry  )r   r!   r/   r%   )r=   rV   r   r   rh   r   r!   r-   r-   r.   rF     s    z&PerceiverClassificationDecoder.forward)NNN)NFr&   r'   r(   r)   r7   r`  rG   r]  r   r*   rx   r+   r
   ry   r/   rF   rH   r-   r-   r?   r.   r	    s   
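
# A hedged usage sketch (not part of the original module): constructing the classification
# decoder with a single trainable query of `d_latents` channels, the way the classification
# heads in this file do. The config values and the dummy tensor used to infer the batch size
# are assumptions made for illustration only.
def _example_classification_decoder():
    import torch

    config = PerceiverConfig(d_latents=64, num_latents=32, num_labels=10)
    decoder = PerceiverClassificationDecoder(
        config,
        num_channels=config.d_latents,
        trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
        use_query_residual=True,
    )
    latents = torch.randn(2, config.num_latents, config.d_latents)
    dummy_inputs = torch.zeros(2, 1)  # only the batch dimension is used to build the query
    query = decoder.decoder_query(dummy_inputs)
    outputs = decoder(query, z=latents)
    return outputs.logits.shape  # expected: torch.Size([2, 10])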

	  r	  c                       sd   e Zd ZdZd fdd	ZeedddZdd
dZde	j
e	jee	j ee edddZ  ZS )r>  z+Cross-attention based optical flow decoder.r]   r;  c                    s6   t    || _|| _|| _t|fd|i|| _d S )Nr   )r6   r7   r<  r   r=  r   r   )r=   r>   r<  r   r=  rz  r?   r-   r.   r7   	  s
    
z$PerceiverOpticalFlowDecoder.__init__rl  c                 C   s   | j jS r5   r{  r   r-   r-   r.   r]  	  s    z.PerceiverOpticalFlowDecoder.num_query_channelsNc                 C   s   |d k	rt d|S )Nz,FlowDecoder doesn't support subsampling yet.)rL   r\  r-   r-   r.   r   	  s    z)PerceiverOpticalFlowDecoder.decoder_queryFrx  c                 C   sV   | j |||d}|j}|| j }||jd gt| j |jd g }t||jdS )Nr}  r   rD   ry  )	r   r!   r=  ru  ro   listr<  r/   r%   )r=   rV   r   r   rh   r   predsr-   r-   r.   rF   	  s
    
(z#PerceiverOpticalFlowDecoder.forward)r]   r;  )NNN)NFr~  r-   r-   r?   r.   r>  	  s   
	  r>  c                       sl   e Zd ZdZeee edd fddZe	edddZ
dd	d
Zdejejeej edddZ  ZS )rP  a  
    Cross-attention based video-autoencoding decoder. Light-weight wrapper of [*PerceiverBasicDecoder*] with video
    reshaping logic.

    Args:
        config ([*PerceiverConfig*]):
            Model configuration.
        output_shape (`List[int]`):
            Shape of the output as (batch_size, num_frames, height, width), excluding the channel dimension.
        position_encoding_type (`str`):
            The type of position encoding to use. Can be either "trainable", "fourier", or "none".
    N)r>   rI  r  ri   c                    s\   t    t|dkr&td| d|| _|d | _t|f| jdd |d|| _d S )Nr   z"Expected rank 4 output_shape, got rV  r   r   )r   r  )r6   r7   r   rL   rI  r   r   r   )r=   r>   rI  r  rz  r?   r-   r.   r7   6	  s    

z/PerceiverBasicVideoAutoencodingDecoder.__init__rl  c                 C   s   | j jS r5   r{  r   r-   r-   r.   r]  G	  s    z9PerceiverBasicVideoAutoencodingDecoder.num_query_channelsc                 C   s   | j j||||dS )N)r   r   r   r|  r\  r-   r-   r.   r   K	  s    z4PerceiverBasicVideoAutoencodingDecoder.decoder_queryrd  c                 C   s:   |  ||}|j}t|| j|jd g }t||jdS )NrD   ry  )r   r!   r*   ru  rI  ro   r/   r%   )r=   rV   r   r   r   r!   r-   r-   r.   rF   S	  s    z.PerceiverBasicVideoAutoencodingDecoder.forward)NNN)N)r&   r'   r(   r)   r   r   rG   r   r7   r`  r]  r   r*   rx   r+   r
   r/   rF   rH   r-   r-   r?   r.   rP  (	  s      
	   rP  )r   rf   ri   c                 C   sN   i }d}t |  D ]4}| | }|dd||| f }||7 }|||< q|S )a  
    Partitions a [B, N, C] tensor into tensors for each modality.

    Args:
        modality_sizes
            dict specifying the size of the modality
        inputs:
            input tensor

    Returns:
        dict mapping name of modality to its associated tensor.
    r   N)sortedrs   )r   rf   rw   r   r   r^   Zinpr-   r-   r.   restructure]	  s    
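
# A small usage sketch (not part of the original module) showing how `restructure` splits a
# packed (batch, positions, channels) tensor back into per-modality tensors. The modality
# names and sizes below are made up for the example.
def _example_restructure():
    import torch

    packed = torch.randn(2, 10, 8)  # batch of 2, 10 positions, 8 channels
    modality_sizes = {"image": 6, "audio": 3, "label": 1}  # sizes must sum to 10
    per_modality = restructure(modality_sizes, packed)
    # Modalities are sliced along dimension 1 in sorted-key order:
    # "audio" -> (2, 3, 8), "image" -> (2, 6, 8), "label" -> (2, 1, 8)
    return {name: tensor.shape for name, tensor in per_modality.items()}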
r  c                
       s   e Zd ZdZdeeeef eee	e e	eeef  dd fddZ
eeddd	Zdd
dZdejeje	ej e	e ejdddZ  ZS )rQ  a1  
    Multimodal decoding by composing uni-modal decoders. The *modalities* argument of the constructor is a dictionary
    mapping modality name to the decoder of that modality. That decoder will be used to construct queries for that
    modality. Modality-specific queries are padded with trainable modality-specific parameters, after which they are
    concatenated along the time dimension.

    Next, there is a shared cross attention operation across all modalities.

    Args:
        config ([*PerceiverConfig*]):
            Model configuration.
        modalities (`Dict[str, PerceiverAbstractDecoder]`):
            Dictionary mapping modality name to the decoder of that modality.
        num_outputs (`int`):
            The number of outputs of the decoder.
        output_num_channels (`int`):
            The number of channels in the output.
        min_padding_size (`int`, *optional*, defaults to 2):
            The minimum padding size for all modalities. The final output will have num_channels equal to the maximum
            channels across all modalities plus min_padding_size.
        subsampled_index_dims (`Dict[str, PerceiverAbstractDecoder]`, *optional*):
            Dictionary mapping modality name to the subsampled index dimensions to use for the decoder query of that
            modality.
    r]   N)r>   rF  rK  r   rE  rf  ri   c                    sp   t    t| _| _| _| _| _t	|f|f|d j
d| _t fdd| D  _d S )Nrg  )r   r   r  r   c              
      s,   i | ]$\}}|t td  j|j qS r   )r   r8   r*   r9   r]  )r   r   r   r   r-   r.   
<dictcomp>	  s    z7PerceiverMultimodalDecoder.__init__.<locals>.<dictcomp>)r6   r7   r   
ModuleDictrF  rf  rE  r   rK  r   r]  r   r   r   padding)r=   r>   rF  rK  r   rE  rf  rz  r?   r   r.   r7   	  s*    


z#PerceiverMultimodalDecoder.__init__rl  c                 C   s&   t dd | j D }|| j }|S )Nc                 s   s   | ]\}}|j V  qd S r5   )r]  )r   rv   r   r-   r-   r.   r   	  s     z@PerceiverMultimodalDecoder.num_query_channels.<locals>.<genexpr>maxrF  r   rE  r=   Zmax_channel_sizeZcommon_channel_sizer-   r-   r.   r]  	  s    
z-PerceiverMultimodalDecoder.num_query_channelsc           	   	      s   t ||}|pi }i  j D ]F\}}d }|d k	r@||d }|j|| d |||d d}| |< q fddtj fddtj D ddS )N)rf   r   r   r   c              	      st   t ||jd t|jdd |jd g} j|  }t ||jd |jd  j|jd  g}t j||gddS )Nr   r   rD   r]   rk   )	r*   ru  ro   rq  prodr  rt  r]  rv  )r   ra   ro  r   r-   r.   embed	  s    .
*z7PerceiverMultimodalDecoder.decoder_query.<locals>.embedc                    s   g | ]}| | qS r-   r-   )r   r   )decoder_queriesr  r-   r.   rn  	  s     z<PerceiverMultimodalDecoder.decoder_query.<locals>.<listcomp>r   rk   )	r  rF  r   getr   r*   rv  r  rs   )	r=   rf   r   r   r   r   r   Zinput_without_posrV   r-   )r  r  r=   r.   r   	  s&    


 z(PerceiverMultimodalDecoder.decoder_queryFrx  c                 C   s   | j |||d}|S )Nr}  )r   )r=   rV   r   r   rh   r   r-   r-   r.   rF   	  s    z"PerceiverMultimodalDecoder.forward)r]   N)NN)NF)r&   r'   r(   r)   r   r   r   rZ  rG   r
   r7   r`  r]  r   r*   rx   r+   ry   rF   rH   r-   r-   r?   r.   rQ  u	  s0     

&  rQ  )framestemporal_block_sizespatial_block_sizeri   c              
   C   s  t | jdkrt| j\}}}}| |||| ||| |} | dddddd } | ||| || |d | } | S t | jdkr| j\}}}}}| ||| |||| ||| |} | dddddddd } | ||| || || ||d  | } | S td	d
S )a/  
    Space-to-depth transform. Rearranges blocks of spatial data into the depth (channel) dimension.

    This function assumes the channels come first in the input, but places the channels last after the transformation.

    Based on https://discuss.pytorch.org/t/is-there-any-layer-like-tensorflows-space-to-depth-function/3487/15.
    r   r   r]   r      r         zlFrames should be of rank 4 (batch, channels, height, width) or rank 5 (batch, time, channels, height, width)N)r   ro   r_   r`   rr   rL   )r  r  r  rB   r   heightwidthtimer-   r-   r.   space_to_depth	  sT    	
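
# A small usage sketch (not part of the original module): space_to_depth on a rank-4 input.
# With spatial_block_size=2, every 2x2 spatial block of a (batch, channels, height, width)
# tensor is folded into the channel dimension, and channels are moved last.
def _example_space_to_depth():
    import torch

    frames = torch.randn(1, 3, 8, 8)  # (batch, channels, height, width)
    out = space_to_depth(frames, temporal_block_size=1, spatial_block_size=2)
    return out.shape  # torch.Size([1, 4, 4, 12]), i.e. (batch, height/2, width/2, channels * 2 * 2)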
r  c                       s(   e Zd ZdZ fddZdd Z  ZS )Conv2dSamePaddingz
    Conv2d layer with padding="same" support. Source:
    https://gist.github.com/sumanmichael/4de9dee93f972d47c80c4ade8e149ea6
    c              	      s>   t t| j|| tttdd | jd d d D | _d S )Nc                 S   s0   g | ](}|d  |d |d     d |d  fqS )r]   r   r-   r   kr-   r-   r.   rn  )
  s     z.Conv2dSamePadding.__init__.<locals>.<listcomp>rD   )	r6   r  r7   r   Z	ZeroPad2dr   r   kernel_sizezero_pad_2dr=   argskwargsr?   r-   r.   r7   &
  s    zConv2dSamePadding.__init__c                 C   s   |  | || j| jS r5   )Z_conv_forwardr  r   r   )r=   inputr-   r-   r.   rF   ,
  s    zConv2dSamePadding.forward)r&   r'   r(   r)   r7   rF   rH   r-   r-   r?   r.   r   
  s   r  c                       sB   e Zd ZdZdeeeed fddZejejd	d
dZ	  Z
S )Conv2DDownsamplezBDownsamples 4x by applying a 2D convolution and doing max pooling.r   r   r%  T)
num_layersrL  r  use_batchnormc                    sV   t    t||dddd| _|r.tj|dnt | _t | _	tj
ddd| _dS )	a}  
        Constructs a Conv2DDownsample model.

        Args:
          num_layers (`int`, *optional*, defaults to 1):
            The number of convolutional layers.
          in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
          out_channels (`int`, *optional*, defaults to 64):
            The number of conv output channels.
          use_batchnorm (`bool`, *optional*, defaults to `True`):
            Whether to use batchnorm.
        r  r]   F)rL  r  r  strider   )Znum_featuresr   )r  r  N)r6   r7   r  r2  r   ZBatchNorm2drS   	batchnormZReLUreluZ	MaxPool2dmax_pool)r=   r  rL  r  r  r?   r-   r.   r7   3
  s    
    
zConv2DDownsample.__init__)rf   ri   c                 C   s,   |  |}| |}| |}| |}|S r5   )r2  r  r  r  )r=   rf   outr-   r-   r.   rF   N
  s
    



zConv2DDownsample.forward)r   r   r%  T)r&   r'   r(   r)   rG   ry   r7   r*   rx   rF   rH   r-   r-   r?   r.   r  0
  s       r  r#  TFc              	      s   | j d }d tj fdd|D dd}| dddddf dddddf |dddddf  }t|dt|j dd g}|rttj| }n*tjttj| t	tj| gdd}|rtj| |
|ddgdd}|S )	a  
    Generate a Fourier frequency position encoding with linear spacing.

    Args:
      pos (`torch.LongTensor` of shape `(batch_size, sequence_length, dim)`):
        The Tensor containing the position of n points in d dimensional space.
      num_bands (`int`):
        The number of frequency bands (K) to use.
      max_resolution (`Tuple[int]`, *optional*, defaults to (224, 224)):
        The maximum resolution (i.e. the number of pixels per dim). A tuple representing resolution for each dimension.
      concat_pos (`bool`, *optional*, defaults to `True`):
        Whether to concatenate the input position encoding to the Fourier features.
      sine_only (`bool`, *optional*, defaults to `False`):
        Whether to use a single phase (sin) or two (sin/cos) for each frequency band.

    Returns:
      `torch.FloatTensor` of shape `(batch_size, sequence_length, n_channels)`: The Fourier position embeddings. If
      `concat_pos` is `True` and `sine_only` is `False`, output dimensions are ordered as: [dim_1, dim_2, ..., dim_d,
      sin(pi*f_1*dim_1), ..., sin(pi*f_K*dim_1), ..., sin(pi*f_1*dim_d), ..., sin(pi*f_K*dim_d), cos(pi*f_1*dim_1),
      ..., cos(pi*f_K*dim_1), ..., cos(pi*f_1*dim_d), ..., cos(pi*f_K*dim_d)], where dim_i is pos[:, i] and f_k is the
      kth frequency band.
    r   r   c                    s    g | ]}t j |d  dqS )r]   )startendsteps)r*   linspace)r   resZmin_freqr)  r-   r.   rn  s
  s     z-generate_fourier_features.<locals>.<listcomp>rk   NrD   r   )ro   r*   rs  ru  rq  r  sinpirv  cosrE   )ro  r)  r(  r'  r*  rB   Z
freq_bandsZper_pos_featuresr-   r  r.   generate_fourier_featuresV
  s"    
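
# A small usage sketch (not part of the original module): Fourier features for a 2x2 grid of
# positions, matching the shape convention documented above. The grid size and band count are
# assumptions chosen for the example.
def _example_generate_fourier_features():
    pos = build_linear_positions((2, 2)).reshape(1, -1, 2)  # (batch_size=1, 4 positions, 2 dims)
    features = generate_fourier_features(pos, num_bands=4, max_resolution=(2, 2), concat_pos=True, sine_only=False)
    # channels = 2 raw position dims + 2 dims * 4 bands * 2 phases (sin and cos) = 18
    return features.shape  # torch.Size([1, 4, 18])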
 >  r  g      r   c                    s:   fdd  fdd| D }t |ddi}tj|ddS )	a  
    Generate an array of position indices for an N-D input array.

    Args:
      index_dims (`List[int]`):
        The shape of the index dimensions of the input array.
      output_range (`Tuple[float]`, *optional*, defaults to `(-1.0, 1.0)`):
        The min and max values taken by each input index dimension.

    Returns:
      `torch.FloatTensor` of shape `(index_dims[0], index_dims[1], .., index_dims[-1], N)`.
    c                    s   t j d  d | t jdS )Nr   r   )r  r  r  r  )r*   r  Zfloat32)n_xels_per_dim)output_ranger-   r.   	_linspace
  s    z)build_linear_positions.<locals>._linspacec                    s   g | ]} |qS r-   r-   )r   r  )r  r-   r.   rn  
  s     z*build_linear_positions.<locals>.<listcomp>ZindexingZijrD   rk   )r   r*   rs  )r   r  Z
dim_rangesZarray_index_gridr-   )r  r  r.   build_linear_positions
  s    r  c                   @   sJ   e Zd ZdZeejedddZejedddZ	ejdd Z
d	S )
!PerceiverAbstractPositionEncodingz%Perceiver abstract position encoding.rl  c                 C   s   t d S r5   r[  r   r-   r-   r.   num_dimensions
  s    z0PerceiverAbstractPositionEncoding.num_dimensionsc                 O   s   t d S r5   r[  r  r-   r-   r.   rm  
  s    z-PerceiverAbstractPositionEncoding.output_sizec                 C   s   t d S r5   r[  )r=   rB   ro  r-   r-   r.   rF   
  s    z)PerceiverAbstractPositionEncoding.forwardN)r&   r'   r(   r)   r`  r^  r_  rG   r  rm  rF   r-   r-   r-   r.   r  
  s   r  c                       sT   e Zd ZdZd fdd	ZeedddZeddd	Zee	j
d
ddZ  ZS )r   zTrainable position encoding.re  c                    s8   t    || _|| _t|}tt	||| _
d S r5   )r6   r7   _num_channels_index_dimsrq  r  r   r8   r*   r9   r   )r=   r   r   Z	index_dimr?   r-   r.   r7   
  s
    

z+PerceiverTrainablePositionEncoding.__init__rl  c                 C   s   t | jtrdS t| jS )Nr   )r   r  rG   r   r   r-   r-   r.   r  
  s    z1PerceiverTrainablePositionEncoding.num_dimensionsc                 O   s   | j S r5   )r  r  r-   r-   r.   rm  
  s    z.PerceiverTrainablePositionEncoding.output_size)rB   ri   c                 C   s    | j }|d k	r||dd}|S rC   )r   rE   )r=   rB   r   r-   r-   r.   rF   
  s    z*PerceiverTrainablePositionEncoding.forward)re  )r&   r'   r(   r)   r7   r`  rG   r  rm  r*   rx   rF   rH   r-   r-   r?   r.   r   
  s   r   c                 C   s^   | dkr@t |} | d |f| j } t| |t|dg} n| jd t|krZtd| S )a  
    Checks or builds spatial position features (x, y, ...).

    Args:
      pos (`torch.FloatTensor`):
        None, or an array of position features. If None, position features are built. Otherwise, their size is checked.
      index_dims (`List[int]`):
        An iterable giving the spatial/index size of the data to be featurized.
      batch_size (`int`):
        The batch size of the data to be featurized.

    Returns:
        `torch.FloatTensor` of shape `(batch_size, prod(index_dims))` an array of position features.
    NrD   z5Spatial features have the wrong number of dimensions.)	r  rE   ro   r*   ru  rq  r  r   rL   )ro  r   rB   r-   r-   r.   !_check_or_build_spatial_positions
  s    r  c                       sb   e Zd ZdZd fdd	ZeedddZd	d
 Zde	e ee
je
je
je
jdddZ  ZS )rW  z'Fourier (Sinusoidal) position encoding.TFc                    s&   t    || _|| _|| _|| _d S r5   )r6   r7   r)  r(  r'  r*  )r=   r)  r(  r'  r*  r?   r-   r.   r7   
  s
    
z)PerceiverFourierPositionEncoding.__init__rl  c                 C   s
   t | jS r5   )r   r(  r   r-   r-   r.   r  
  s    z/PerceiverFourierPositionEncoding.num_dimensionsc                 C   s6   t | j}| j| }| js"|d9 }| jr2|| j7 }|S )z4Returns size of positional encodings last dimension.r]   )r   r(  r)  r*  r'  r  )r=   Znum_dimsZencoding_sizer-   r-   r.   rm  
  s    


z,PerceiverFourierPositionEncoding.output_sizeN)r   rB   r   r  ro  ri   c                 C   s4   t |||}t|| j| j| j| jdj||d}|S )N)r)  r(  r'  r*  rp  )r  r  r)  r(  r'  r*  to)r=   r   rB   r   r  ro  Zfourier_pos_encr-   r-   r.   rF     s     z(PerceiverFourierPositionEncoding.forward)TF)N)r&   r'   r(   r)   r7   r`  rG   r  rm  r   r*   r   r  r+   rF   rH   r-   r-   r?   r.   rW  
  s    rW  c                   @   s   e Zd ZeedddZdS )AbstractPreprocessorrl  c                 C   s
   t  dS )z$Returns size of preprocessor output.Nr[  r   r-   r-   r.   r     s    z!AbstractPreprocessor.num_channelsN)r&   r'   r(   r`  rG   r   r-   r-   r-   r.   r    s   r  c                       sV   e Zd ZdZedd fddZeedddZde	j
ee	j ed
ddZ  ZS )r   a*  
    Text preprocessing for Perceiver Encoder. Can be used to embed `inputs` and add positional encodings.

    The dimensionality of the embeddings is determined by the `d_model` attribute of the configuration.

    Args:
        config ([`PerceiverConfig`]):
            Model configuration.
    Nr>   ri   c                    s:   t    || _tj|j|jd| _t|j|j| _	d S )N)Znum_embeddingsZembedding_dim)
r6   r7   r>   r   r   r  r   r   r   r   r<   r?   r-   r.   r7   )  s    
z"PerceiverTextPreprocessor.__init__rl  c                 C   s   | j jS r5   )r>   r   r   r-   r-   r.   r   /  s    z&PerceiverTextPreprocessor.num_channelsTrf   ro  network_input_is_1dc                 C   s>   |  |}|jd }tjd||jd}|| | }|d |fS )Nr   r   r   )r   ro   r*   Zaranger   r   )r=   rf   ro  r  Zembeddings_without_posr   Zposition_idsr   r-   r-   r.   rF   3  s
    

z!PerceiverTextPreprocessor.forward)NT)r&   r'   r(   r)   r   r7   r`  rG   r   r*   Z
LongTensorr
   rx   ry   rF   rH   r-   r-   r?   r.   r     s
   
r   c                       s@   e Zd ZdZedd fddZejejejdddZ  Z	S )	r   z
    Module to decode embeddings (for masked language modeling).

    Args:
        config ([`PerceiverConfig`]):
            Model configuration.
    Nr  c                    s0   t    || _|j| _tt| j| _d S r5   )	r6   r7   r>   r  r   r8   r*   Zzerosr   r<   r?   r-   r.   r7   F  s    
z"PerceiverEmbeddingDecoder.__init__)r#   r  ri   c                 C   sH   |j \}}}t|d|g|jdd}|| j }|||| jgS )NrD   r   r   )ro   r*   rm   ru  r   rn   r   r  )r=   r#   r  rB   ru   r   r   r-   r-   r.   rF   L  s     
z!PerceiverEmbeddingDecoder.forward)
r&   r'   r(   r)   r   r7   r*   rx   rF   rH   r-   r-   r?   r.   r   =  s   r   c                       sX   e Zd ZdZd
eeef ed fddZde	j
ee	j
 eee	j
f ddd	Z  ZS )rR  a?  
    Multimodal postprocessing for Perceiver. Can be used to combine modality-specific postprocessors into a single
    postprocessor.

    Args:
          modalities (`Mapping[str, PostprocessorType]`):
            Dictionary mapping modality name to postprocessor class for that modality.
          input_is_dict (`bool`, *optional*, defaults to `False`):
            If True, input is assumed to be dictionary structured, and outputs keep the same dictionary shape. If
            False, input is a tensor which is sliced up during postprocessing by *modality_sizes*.
    F)rF  input_is_dictc                    s    t    t|| _|| _d S r5   )r6   r7   r   r  rF  r  )r=   rF  r  r?   r-   r.   r7   b  s    
z)PerceiverMultimodalPostprocessor.__init__Nrf   ro  ri   c                    s@   | j s"|d krtdt| d  fdd| j D }|S )Nz@Modality sizes should be specified if input is not a dictionary.)r   rf   c                    s$   i | ]\}}|| | d dqS )N)ro  r   r-   )r   r   Zpostprocessorrf   ro  r-   r.   r  p  s    z<PerceiverMultimodalPostprocessor.forward.<locals>.<dictcomp>)r  rL   r  rF  r   )r=   rf   ro  r   rw   r-   r  r.   rF   g  s    z(PerceiverMultimodalPostprocessor.forward)F)NN)r&   r'   r(   r)   r	   r   r   ry   r7   r*   rx   r
   rF   rH   r-   r-   r?   r.   rR  U  s       rR  c                       sD   e Zd ZdZeedd fddZd	eej	 ej	dddZ
  ZS )
rU  a  
    Classification postprocessing for Perceiver. Can be used to convert the decoder output to classification logits.

    Args:
        config ([*PerceiverConfig*]):
            Model configuration.
        in_channels (`int`):
            Number of channels in the input.
    N)r>   rL  ri   c                    s   t    t||j| _d S r5   )r6   r7   r   rU   r  rc  )r=   r>   rL  r?   r-   r.   r7     s    
z-PerceiverClassificationPostprocessor.__init__)ro  ri   c                 C   s    |  |}|d d dd d f S )Nr   rc  r=   rf   ro  r   r!   r-   r-   r.   rF     s    
z,PerceiverClassificationPostprocessor.forward)NN)r&   r'   r(   r)   r   rG   r7   r
   r*   rx   rF   rH   r-   r-   r?   r.   rU  w  s   
rU  c                       sL   e Zd ZdZd
eeedd fddZdej	e
ej	 ej	ddd	Z  ZS )rS  a  
    Audio postprocessing for Perceiver. Can be used to convert the decoder output to audio features.

    Args:
        config ([*PerceiverConfig*]):
            Model configuration.
        in_channels (`int`):
            Number of channels in the input.
        postproc_type (`str`, *optional*, defaults to `"patches"`):
            Postprocessor type to use. Currently, only "patches" is supported.
    r6  N)r>   rL  postproc_typeri   c                    s.   t    |dkrtdt||j| _d S )Nr6  zInvalid postproc_type!)r6   r7   rL   r   rU   rC  rc  )r=   r>   rL  r  r?   r-   r.   r7     s    
z$PerceiverAudioPostprocessor.__init__r  c                 C   s    |  |}t||jd dgS Nr   rD   )rc  r*   ru  ro   r  r-   r-   r.   rF     s    
z#PerceiverAudioPostprocessor.forward)r6  )NN)r&   r'   r(   r)   r   rG   r   r7   r*   rx   r
   rF   rH   r-   r-   r?   r.   rS    s   	rS  c                       sH   e Zd ZdZeedd fddZd	ejeej ejdddZ	  Z
S )
rT  a'  
    Projection postprocessing for Perceiver. Can be used to project the channels of the decoder output to a lower
    dimension.

    Args:
        in_channels (`int`):
            Number of channels in the input.
        out_channels (`int`):
            Number of channels in the output.
    N)rL  r  ri   c                    s   t    t||| _d S r5   )r6   r7   r   rU   rc  )r=   rL  r  r?   r-   r.   r7     s    
z)PerceiverProjectionPostprocessor.__init__r  c                 C   s   |  |}|S r5   r  r  r-   r-   r.   rF     s    
z(PerceiverProjectionPostprocessor.forward)NN)r&   r'   r(   r)   rG   r7   r*   rx   r
   rF   rH   r-   r-   r?   r.   rT    s   rT  c                       s|   e Zd ZdZdeeeeeeeeeed
 fddZeedddZ	de
jedddZde
jee
j edddZ  ZS )r  a  
    Image preprocessing for Perceiver Encoder.

    Note: the *out_channels* argument refers to the output channels of a convolutional layer, if *prep_type* is set to
    "conv1x1" or "conv". If one adds absolute position embeddings, one must make sure the *num_channels* of the
    position encoding kwargs are set equal to the *out_channels*.

    Args:
        config ([*PerceiverConfig*]):
            Model configuration.
        prep_type (`str`, *optional*, defaults to `"conv"`):
            Preprocessing type. Can be "conv1x1", "conv", "patches", "pixels".
        spatial_downsample (`int`, *optional*, defaults to 4):
            Spatial downsampling factor.
        temporal_downsample (`int`, *optional*, defaults to 1):
            Temporal downsampling factor (only relevant in case a time dimension is present).
        position_encoding_type (`str`, *optional*, defaults to `"fourier"`):
            Position encoding type. Can be "fourier" or "trainable".
        in_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input.
        out_channels (`int`, *optional*, defaults to 64):
            Number of channels in the output.
        conv_after_patching (`bool`, *optional*, defaults to `False`):
            Whether to apply a convolutional layer after patching.
        conv_after_patching_in_channels (`int`, *optional*, defaults to 54):
            Number of channels in the input of the convolutional layer after patching.
        conv2d_use_batchnorm (`bool`, *optional*, defaults to `True`):
            Whether to use batch normalization in the convolutional layer.
        concat_or_add_pos (`str`, *optional*, defaults to `"concat"`):
            How to concatenate the position encoding to the input. Can be "concat" or "add".
        project_pos_dim (`int`, *optional*, defaults to -1):
            Dimension of the position encoding to project to. If -1, no projection is applied.
        **position_encoding_kwargs (`Dict`, *optional*):
            Keyword arguments for the position encoding.
    r2  r   r   r3  r   r%  Fr7  Tr  rD   )
r  r:  r  rL  r  r8  r9  conv2d_use_batchnormr  r  c                    s8  t    || _|dkr(td| d|dkr@td| d|| _|| _|| _|| _|| _|| _	|| _
|| _| jdkrt|d}|t|k}|r|d	krtd
t|t|||
d| _n2| jdkr|d	krtdtj||d||fd| _|| _tf |||d|\| _| _|r*t|	| jnt | _d S )N)r2  r6  r+  r  
Prep_type z is invalidr  addzInvalid value z for concat_or_add_pos.r2  r   r   zYOnly powers of 4 expected for spatial and 1 expected for temporal downsampling with conv.)rL  r  r  r  r  z$Conv1x1 does not downsample in time.)r   r   )rL  r  r  r  r  r  r  )r6   r7   r>   rL   rL  r  r  r:  r  r  r8  r  rp   logrq  roundr  rG   convnetr   r   convnet_1x1r  rY  r   rX  rU   rS   conv_after_patches)r=   r>   r  r  r:  r  rL  r  r8  r9  r  r  r  ri  Zconvnet_num_layersZconvnet_num_layers_is_intr?   r-   r.   r7     s\    



		z#PerceiverImagePreprocessor.__init__rl  c                 C   s   | j jdk}| jdkr| j}n
| j  }| jdkr6|S | jsF| jdkrN| j}n\| jdkrt| j}|st	
|| j }n6| jdkr| jr| j}n| j| jd  }|r|| j9 }|| S )Nr]   r   r  )r  r2  r+  r6  )r   r  r  rm  r  r8  r  r  rL  rp   ceilr  r:  )r=   Zis_temporalpos_dimZinp_dimr-   r-   r.   r   *  s&    





z'PerceiverImagePreprocessor.num_channels)rf   r  c           	      C   s   |j d }|j dd }t|}t|j dkrF|rFt|||dg}| jdkr\| |}n | jdkr|| j|||j|j	d}| 
|}|s|j }t|t|dd dg }| jd	krtj||gdd
}n| jdkr|| }||fS )z
        Construct the final input, including position encoding.

        This method expects the inputs to always have channels as last dimension.

        r   r   rD   r   r  r3  rp  Nr  rk   r  )ro   rq  r  r   r*   ru  r  r   r   r  rX  r  r  rv  )	r=   rf   r  rB   r   rw  pos_encshinputs_with_posr-   r-   r.   _build_network_inputsK  s$    






z0PerceiverImagePreprocessor._build_network_inputsNr  c                 C   s`  | j dkr| |}n| j dkr,| |}n| j dkr|jdkr^|d d | jd d | jf }q|jdkr|d d d d | jd d d d | jd d | jf }qtdnJ| j dkrt|| j| jd}|jdkr|jd	 d	kr|j	d	d
}| 
|}| j dkrB|jdkr|dddd	}n(|jdkr:|dd	ddd}ntd| ||\}}d }|||fS )Nr2  r  r+  r   r  z#Unsupported data format for pixels.r6  )r  r  r   rk   r   r]   r   z$Unsupported data format for conv1x1.)r  r  r  ndimr  r:  rL   r  ro   r  r  r`   r  r=   rf   ro  r  r   r   r-   r-   r.   rF   n  sD    







  
z"PerceiverImagePreprocessor.forward)r2  r   r   r3  r   r%  Fr7  Tr  rD   )T)NT)r&   r'   r(   r)   rG   r   ry   r7   r`  r   r*   rx   r  r
   rF   rH   r-   r-   r?   r.   r    s8   '           J #r  c                       sV   e Zd ZdZedd fddZeedddZde	j
ee	j
 ed
ddZ  ZS )rO  z
    One-hot preprocessor for Perceiver Encoder. Can be used to add a dummy index dimension to the input.

    Args:
        config ([`PerceiverConfig`]):
            Model configuration.
    Nr  c                    s   t    || _d S r5   )r6   r7   r>   r<   r?   r-   r.   r7     s    
z$PerceiverOneHotPreprocessor.__init__rl  c                 C   s   | j jS r5   )r>   r  r   r-   r-   r.   r     s    z(PerceiverOneHotPreprocessor.num_channelsTr  c                 C   s    |d d d d d f }|d |fS r5   r-   )r=   rf   ro  r  r-   r-   r.   rF     s    z#PerceiverOneHotPreprocessor.forward)NT)r&   r'   r(   r)   r   r7   r`  rG   r   r*   rx   r
   ry   rF   rH   r-   r-   r?   r.   rO    s
   rO  c                       sd   e Zd ZdZdeeeed fd	d
ZeedddZdd Z	de
jee
j edddZ  ZS )rN  a'  
    Audio preprocessing for Perceiver Encoder.

    Args:
        config ([*PerceiverConfig*]):
            Model configuration.
        prep_type (`str`, *optional*, defaults to `"patches"`):
            Preprocessor type to use. Only "patches" is supported.
        samples_per_patch (`int`, *optional*, defaults to 96):
            Number of samples per patch.
        position_encoding_type (`str`, *optional*, defaults to `"fourier"`):
            Type of position encoding to use. Can be "trainable" or "fourier".
        concat_or_add_pos (`str`, *optional*, defaults to `"concat"`):
            How to concatenate the position encoding to the input. Can be "concat" or "add".
        out_channels (`int`, *optional*, defaults to 64):
            Number of channels in the output.
        project_pos_dim (`int`, *optional*, defaults to -1):
            Dimension of the position encoding to project to. If -1, no projection is applied.
        **position_encoding_kwargs (`Dict`, *optional*):
            Keyword arguments for the position encoding.
    r6  `   r3  r  r%  rD   )r  rC  r  r  c           	         sz   t    || _|dkr(td| d|dkr@td| d|| _|| _|| _|| _tf |||d|\| _	| _
d S )Nr  r  z# is invalid, can only be 'patches'.r  zConcat_or_pos z+ is invalid, can only be 'concat' or 'add'.r  )r6   r7   r>   rL   rC  r  r  r  rY  r   rX  )	r=   r>   r  rC  r  r  r  r  ri  r?   r-   r.   r7     s"    
z#PerceiverAudioPreprocessor.__init__rl  c                 C   s4   | j dkr| j }n
| j }| jdkr*|S | j| S )Nr   r  )r  r   rm  r  rC  )r=   r  r-   r-   r.   r     s    


z'PerceiverAudioPreprocessor.num_channelsc                 C   s   |j d }|j dd }| jdkr.| |}n | jdkrN| j|||j|jd}| |}| jdkrvtj||gdd}n| jd	kr|| }||fS )
z7Construct the final input, including position encoding.r   r   rD   r  r3  rp  r  rk   r  )	ro   r  r   r   r  rX  r  r*   rv  )r=   rf   rB   r   r  r  r-   r-   r.   r    s    





z0PerceiverAudioPreprocessor._build_network_inputsNTr  c                 C   s6   t ||jd d| jg}| |\}}d }|||fS r  )r*   ru  ro   rC  r  r  r-   r-   r.   rF     s    z"PerceiverAudioPreprocessor.forward)r6  r  r3  r  r%  rD   )NT)r&   r'   r(   r)   r   rG   r7   r`  r   r  r*   rx   r
   ry   rF   rH   r-   r-   r?   r.   rN    s"         !
rN  c                       sx   e Zd ZdZdeeef eeeef  e	d fddZ
ee	ddd	Zdeeejf eej eedddZ  ZS )rM  a  
    Multimodal preprocessing for Perceiver Encoder.

    Inputs for each modality are preprocessed, then padded with trainable position embeddings to have the same number
    of channels.

    Args:
        modalities (`Mapping[str, PreprocessorType]`):
            Dict mapping modality name to preprocessor.
        mask_probs (`Dict[str, float]`):
            Dict mapping modality name to masking probability of that modality.
        min_padding_size (`int`, *optional*, defaults to 2):
            The minimum padding size for all modalities. The final output will have num_channels equal to the maximum
            channels across all modalities plus min_padding_size.
    Nr]   )rF  rG  rE  c                    sp   t    t| _| _|d k	r(|ni  _t fdd| D  _	t fdd j D  _
d S )Nc              
      s,   i | ]$\}}|t td  j|j qS r  r   r8   r*   r9   r   )r   r   preprocessorr   r-   r.   r  7  s    z<PerceiverMultimodalPreprocessor.__init__.<locals>.<dictcomp>c              	      s&   i | ]\}}|t td  jqS r  r  )r   r   rv   r   r-   r.   r  =  s      )r6   r7   r   r  rF  rE  rG  r   r   r  mask)r=   rF  rG  rE  r?   r   r.   r7   ,  s    

z(PerceiverMultimodalPreprocessor.__init__rl  c                 C   s&   t dd | j D }|| j }|S )Nc                 s   s   | ]\}}|j V  qd S r5   )r   )r   rv   	processorr-   r-   r.   r   B  s     z?PerceiverMultimodalPreprocessor.num_channels.<locals>.<genexpr>r  r  r-   r-   r.   r   @  s    
z,PerceiverMultimodalPreprocessor.num_channelsT)rf   ro  r  ri   c                    s6  i  i }i }| j  D ]\}}||| ||d\}}	||< |j\}
}}| j| |
dd}t||
|| j| g}tj||gdd}|| j	kr| j
| |
dd}| j	| }tt|
|g|}tj|dd|j}d| | ||  }| |< |jd ||< q fddt  D }tj|dd}|||fS )N)ro  r  rD   r]   rk   r   c                    s   g | ]} | qS r-   r-   r  paddedr-   r.   rn  h  s     z;PerceiverMultimodalPreprocessor.forward.<locals>.<listcomp>)rF  r   ro   r  rE   r*   rt  r   rv  rG  r  Z	bernoullifullZ	unsqueezer  r   r  rs   )r=   rf   ro  r  r   r   r   r  r   rv   rB   Znum_samplesr   r  r  Zoutput_paddedZ
mask_tokenZ	mask_probr  Z	padded_lsZfinal_inputsr-   r  r.   rF   F  s6      

z'PerceiverMultimodalPreprocessor.forward)Nr]   )NT)r&   r'   r(   r)   r	   r   r   r
   floatrG   r7   r`  r   r*   rx   ry   PreprocessorOutputTyperF   rH   r-   r-   r?   r.   rM    s$     
     rM  )NrD   NN)r   r   )r#  TF)r  )rr)   r^  rp   dataclassesr   	functoolsr   operatorr   typingr   r   r   r   r	   r