""" PyTorch VideoMAE (masked autoencoder) model."""

import collections.abc
import math
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Set, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .configuration_videomae import VideoMAEConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "VideoMAEConfig"
_CHECKPOINT_FOR_DOC = "MCG-NJU/videomae-base"

VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "MCG-NJU/videomae-base",
    # See all VideoMAE models at https://huggingface.co/models?filter=videomae
]


@dataclass
class VideoMAEDecoderOutput(ModelOutput):
    """
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nlogitshidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r    r'   r'   m/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/videomae/modeling_videomae.pyr   8   s   
r   c                   @   s^   e Zd ZU dZdZeej ed< dZ	ejed< dZ
eeej  ed< dZeeej  ed< dS )VideoMAEForPreTrainingOutputa  
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`):
            Pixel reconstruction loss.
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nlossr   r   r   )r    r!   r"   r#   r*   r   r$   r%   r&   r   r   r   r   r'   r'   r'   r(   r)   O   s
   
r)   c                    s    fddt fddt| D }t |dddddf |dddddf< t |dddddf |dddddf< t|dS )	z Sinusoid position encoding tablec                    s    fddt D S )Nc              	      s(   g | ] }t d d|d     qS )i'     )nppower).0Zhid_j)d_hidpositionr'   r(   
<listcomp>p   s     zOget_sinusoid_encoding_table.<locals>.get_position_angle_vec.<locals>.<listcomp>)ranger0   )r/   r3   r(   get_position_angle_veco   s    z;get_sinusoid_encoding_table.<locals>.get_position_angle_vecc                    s   g | ]} |qS r'   r'   )r.   Zpos_i)r4   r'   r(   r1   r   s     z/get_sinusoid_encoding_table.<locals>.<listcomp>Nr   r+   r   )r,   arrayr2   sincosr$   r%   Z	unsqueeze)Z
n_positionr/   Zsinusoid_tabler'   )r/   r4   r(   get_sinusoid_encoding_tablek   s
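

# Illustrative note (added by the editor, not part of the original module): the table above
# is a fixed, non-learned positional encoding with one row per patch position. For the
# default base configuration (1568 patches, hidden size 768) one would expect:
#
#     table = get_sinusoid_encoding_table(n_position=1568, d_hid=768)
#     assert table.shape == (1, 1568, 768)
#
# Even-indexed feature dimensions hold sines, odd-indexed ones hold cosines.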


class VideoMAEEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings.

    """

    def __init__(self, config):
        super().__init__()

        self.patch_embeddings = VideoMAEPatchEmbeddings(config)
        self.num_patches = self.patch_embeddings.num_patches
        # fixed sin-cos embedding
        self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)
        self.config = config

    def forward(self, pixel_values, bool_masked_pos):
        # create patch embeddings
        embeddings = self.patch_embeddings(pixel_values)

        # add position embeddings
        embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach()

        # only keep visible patches (~bool_masked_pos means visible)
        if bool_masked_pos is not None:
            batch_size, _, num_channels = embeddings.shape
            embeddings = embeddings[~bool_masked_pos]
            embeddings = embeddings.reshape(batch_size, -1, num_channels)

        return embeddings


class VideoMAEPatchEmbeddings(nn.Module):
    """
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    """

    def __init__(self, config):
        super().__init__()

        image_size = config.image_size
        patch_size = config.patch_size
        num_channels = config.num_channels
        hidden_size = config.hidden_size
        num_frames = config.num_frames
        tubelet_size = config.tubelet_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size
        self.tubelet_size = int(tubelet_size)
        num_patches = (
            (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
        )
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.projection = nn.Conv3d(
            in_channels=num_channels,
            out_channels=hidden_size,
            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
            stride=(self.tubelet_size, patch_size[0], patch_size[1]),
        )

    def forward(self, pixel_values):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings
eejejf eej f d	d
dZ  ZS )VideoMAESelfAttentionNrB   returnc                    s   t    |j|j dkr@t|ds@td|jf d|j d|j| _t|j|j | _| j| j | _t	j
|j| jdd| _t	j
|j| jdd| _t	j
|j| jdd| _|jrt	t| j| _t	t| j| _nd | _d | _t	|j| _d S )Nr   Zembedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .Fbias)r;   r<   r@   num_attention_headshasattrre   r`   attention_head_sizeall_head_sizer   LinearquerykeyvalueZqkv_bias	Parameterr$   zerosq_biasv_biasDropoutZattention_probs_dropout_probdropoutrC   rE   r'   r(   r<      s"    
zVideoMAESelfAttention.__init__)xrm   c                 C   s6   |  d d | j| jf }||}|ddddS )NrG   r   r+   r   r   )sizerq   rs   viewrf   )rD   r   Znew_x_shaper'   r'   r(   transpose_for_scores   s    
z*VideoMAESelfAttention.transpose_for_scoresF)	head_maskoutput_attentionsrm   c                 C   s*  | j d k	rtj| jddnd }tjj|| jj|d}tjj|| j	j| jd}tjj|| j
j| j d}| |}| |}	| |}
t|
|dd}|t| j }tjj|dd}| |}|d k	r|| }t||	}|ddd	d
 }| d d | jf }||}|r ||fn|f}|S )NF)Zrequires_grad)inputweightrp   rG   dimr   r+   r   r   )r{   r$   Z
zeros_liker|   r   Z
functionalZlinearrw   r   rx   rv   r   matmulrh   mathsqrtrs   Zsoftmaxr~   rf   
contiguousr   rt   r   )rD   r   r   r   Zk_biaskeysvaluesZqueriesZ	key_layerZvalue_layerZquery_layerZattention_scoresZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr'   r'   r(   rU      s&    
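

# Editor's note (illustrative, not part of the original module): in the attention block
# above only the query and value projections carry a learnable bias (q_bias / v_bias);
# the key bias is a constant zero tensor rebuilt on every forward pass, so the key
# projection is effectively bias-free while keeping a uniform nn.functional.linear call.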


class VideoMAESelfOutput(nn.Module):
    """
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states
ej	 eeeej	ej	f eej	 f d	d
dZ  ZS )VideoMAEAttentionNrl   c                    s*   t    t|| _t|| _t | _d S r:   )r;   r<   rk   	attentionr   outputsetpruned_headsrC   rE   r'   r(   r<   )  s    


zVideoMAEAttention.__init__)headsrm   c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )lenr   r   rq   rs   r   r   rv   rw   rx   r   r   rt   union)rD   r   indexr'   r'   r(   prune_heads/  s       zVideoMAEAttention.prune_headsFr   r   r   rm   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )rD   r   r   r   Zself_outputsattention_outputr   r'   r'   r(   rU   A  s    zVideoMAEAttention.forward)NF)r    r!   r"   r   r<   r   r`   r   r$   r   r   r   r   r   rU   rW   r'   r'   rE   r(   r   (  s     r   c                       s8   e Zd Zedd fddZejejdddZ  ZS )VideoMAEIntermediateNrl   c                    sB   t    t|j|j| _t|jt	r6t
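

# Illustrative usage (added by the editor, not part of the original module): heads are
# normally pruned through the generic PreTrainedModel API, which dispatches down to
# VideoMAEAttention.prune_heads, e.g.:
#
#     model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
#     model.prune_heads({0: [0, 1]})  # remove heads 0 and 1 of layer 0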


class VideoMAEIntermediate(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class VideoMAEOutput(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states
ejeej e	e
eejejf eej f ddd	Z  ZS )VideoMAELayerz?This corresponds to the Block class in the timm implementation.Nrl   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   Zeps)r;   r<   Zchunk_size_feed_forwardZseq_len_dimr   r   r   intermediater   r   r   	LayerNormr@   layer_norm_epslayernorm_beforelayernorm_afterrC   rE   r'   r(   r<   t  s    



zVideoMAELayer.__init__Fr   c                 C   s`   | j | |||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S )Nr   r   r   )r   r   r   r   r   )rD   r   r   r   Zself_attention_outputsr   r   Zlayer_outputr'   r'   r(   rU   ~  s    


zVideoMAELayer.forward)NF)r    r!   r"   r#   r   r<   r$   r   r   r   r   r   rU   rW   r'   r'   rE   r(   r   q  s     r   c                	       sN   e Zd Zedd fddZd
ejeej eeee	e
ef ddd	Z  ZS )VideoMAEEncoderNrl   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r'   r   r.   rS   rB   r'   r(   r1     s     z,VideoMAEEncoder.__init__.<locals>.<listcomp>F)	r;   r<   rB   r   
ModuleListr2   num_hidden_layerslayergradient_checkpointingrC   rE   r   r(   r<     s    
 zVideoMAEEncoder.__init__FT)r   r   r   output_hidden_statesreturn_dictrm   c                    s   |rdnd } rdnd }t | jD ]\}}	|r8||f }|d k	rH|| nd }
| jr|| jr| fdd}tjj||	||
}n|	||
 }|d } r"||d f }q"|r||f }|stdd |||fD S t|||dS )	Nr'   c                    s    fdd}|S )Nc                     s    | f S r:   r'   inputsmoduler   r'   r(   custom_forward  s    zNVideoMAEEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr'   r   r   r   r   r(   create_custom_forward  s    z6VideoMAEEncoder.forward.<locals>.create_custom_forwardr   r   c                 s   s   | ]}|d k	r|V  qd S r:   r'   r.   vr'   r'   r(   	<genexpr>  s      z*VideoMAEEncoder.forward.<locals>.<genexpr>Zlast_hidden_stater   r   )		enumerater   r   trainingr$   utils
checkpointtupler   )rD   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduleZlayer_head_maskr   layer_outputsr'   r   r(   rU     s4    

zVideoMAEEncoder.forward)NFFT)r    r!   r"   r   r<   r$   r   r   r   r   r   r   rU   rW   r'   r'   rE   r(   r     s   	    
r   c                   @   s2   e Zd ZdZeZdZdZdZdd Z	ddd	Z
d
S )VideoMAEPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VideoMAEConfig
    base_model_prefix = "videomae"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, VideoMAEEncoder):
            module.gradient_checkpointing = value
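

# Illustrative usage (added by the editor, not part of the original module): because the
# class above sets supports_gradient_checkpointing = True, activation checkpointing can be
# toggled through the standard PreTrainedModel API, trading compute for memory in training:
#
#     model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")
#     model.gradient_checkpointing_enable()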


VIDEOMAE_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config ([`VideoMAEConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VIDEOMAE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`VideoMAEImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare VideoMAE Model transformer outputting raw hidden-states without any specific head on top.c                       s~   e Zd Z fddZdd Zdd Zeeee	e
ddejeej eej ee ee ee eee	f d	d
dZ  ZS )VideoMAEModelc                    sT   t  | || _t|| _t|| _|jr4d | _nt	j
|j|jd| _|   d S )Nr   )r;   r<   rB   r9   rQ   r   encoderuse_mean_pooling	layernormr   r   r@   r   	post_initrC   rE   r'   r(   r<     s    

zVideoMAEModel.__init__c                 C   s   | j jS r:   )rQ   r>   )rD   r'   r'   r(   get_input_embeddings'  s    z"VideoMAEModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
        class PreTrainedModel.
        N)itemsr   r   r   r   )rD   Zheads_to_pruner   r   r'   r'   r(   _prune_heads*  s    zVideoMAEModel._prune_headsoutput_typer   NrO   rP   r   r   r   r   rm   c           
      C   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| || j j}| ||}| j|||||d}|d }	| jdk	r| |	}	|s|	f|dd  S t	|	|j
|jdS )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
            length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.

        Returns:

        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with the PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1568, 768]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        if self.layernorm is not None:
            sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class VideoMAEDecoder(nn.Module):
    def __init__(self, config, num_patches):
        super().__init__()

        decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2

        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        )

        self.norm = nn.LayerNorm(config.decoder_hidden_size)
        self.head = (
            nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity()
        )

        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states,
        return_token_num,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # apply Transformer layers (blocks)
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for layer_module in self.decoder_layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    None,
                )
            else:
                layer_outputs = layer_module(hidden_states, head_mask=None, output_attentions=output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_token_num > 0:
            # only keep the masked tokens, which sit at the end of the sequence
            hidden_states = hidden_states[:, -return_token_num:]

        # predictor projection
        hidden_states = self.norm(hidden_states)
        logits = self.head(hidden_states)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
        return VideoMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)


@add_start_docstrings(
    "The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.",
    VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.videomae = VideoMAEModel(config)

        self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
        self.position_embeddings = get_sinusoid_encoding_table(
            self.videomae.embeddings.num_patches, config.decoder_hidden_size
        )

        self.decoder = VideoMAEDecoder(config, num_patches=self.videomae.embeddings.num_patches)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=VideoMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, VideoMAEForPreTrainingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
            (image_size // patch_size) ** 2`.

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 16
        >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

        >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

        >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
        >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
        >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss = outputs.loss
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.encoder_to_decoder(
            sequence_output
        )  # [batch_size, num_visible_patches, decoder_hidden_size]
        batch_size, seq_len, num_channels = sequence_output.shape

        # we don't unshuffle the correct visible token order, but shuffle the position embeddings accordingly.
        if bool_masked_pos is None:
            raise ValueError("One must provide a boolean mask")
        expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values)
        expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach()
        pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels)
        pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels)

        # [batch_size, num_patches, decoder_hidden_size]
        x_full = torch.cat([sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1)

        # [batch_size, num_masked_patches, num_channels * patch_size * patch_size]
        decoder_outputs = self.decoder(x_full, pos_emb_mask.shape[1])
        logits = decoder_outputs.logits

        loss = None
        with torch.no_grad():
            # calculate the labels to be predicted
            if self.config.num_channels != 3:
                # can't unnormalize with the default ImageNet means/stds when not RGB
                frames = pixel_values
            else:
                # first, unnormalize the frames
                device = pixel_values.device
                mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device)[None, None, :, None, None]
                std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device)[None, None, :, None, None]
                frames = pixel_values * std + mean  # in [0, 1]

            batch_size, time, num_channels, height, width = frames.shape
            tubelet_size, patch_size = self.config.tubelet_size, self.config.patch_size
            if self.config.norm_pix_loss:
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate:
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate:
                frames = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size,
                    num_channels,
                )
                # step 4: normalize. As mentioned in the original VideoMAE paper, it's possible to improve
                # performance by normalizing the target per patch
                frames_norm = (frames - frames.mean(dim=-2, keepdim=True)) / (
                    frames.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6
                )
                # step 5: reshape to (batch_size, T // ts * H // ps * W // ps, ts * ps * ps * C)
                videos_patch = frames_norm.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )
            else:
                if self.config.num_channels != 3:
                    raise ValueError(
                        "Can't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False."
                    )
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate: (batch_size, T//ts, H//ps, W//ps, ts, ps, ps, C)
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                videos_patch = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )

            batch_size, _, num_channels = videos_patch.shape
            labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels)

        loss_fct = MSELoss()
        loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return VideoMAEForPreTrainingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.""",
    VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        # Classifier head
        self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import av
        >>> import torch
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with the PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        if self.fc_norm is not None:
            sequence_output = self.fc_norm(sequence_output.mean(1))
        else:
            sequence_output = sequence_output[:, 0]

        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )