""" PyTorch BEiT model."""


import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedLMOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_beit import BeitConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "BeitConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224-pt22k"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/beit-base-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

BEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/beit-base-patch16-224",
    # See all BEiT models at https://huggingface.co/models?filter=beit
]


@dataclass
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
    """
    Class for outputs of [`BeitModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    N)__name__
__module____qualname____doc__ r#   r#   g/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/beit/modeling_beit.pyr   C   s   r           F)input	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class BeitDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class BeitEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = BeitPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        embeddings = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings
        embeddings = self.dropout(embeddings)

        return embeddings


class BeitPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)

        return embeddings
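

# Illustrative shape walkthrough (not part of the original file), assuming the default
# BeitConfig (image_size=224, patch_size=16, num_channels=3, hidden_size=768): the Conv2d
# projection maps (1, 3, 224, 224) -> (1, 768, 14, 14); flatten(2).transpose(1, 2) then
# yields (1, 196, 768), i.e. 14 * 14 = 196 patch embeddings per image. BeitEmbeddings
# prepends the CLS token, giving the sequence length of 197 seen in _EXPECTED_OUTPUT_SHAPE.
#
#     patch_embeddings = BeitPatchEmbeddings(BeitConfig())
#     embeddings = patch_embeddings(torch.randn(1, 3, 224, 224))
#     assert embeddings.shape == (1, 196, 768)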


class BeitSelfAttention(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        if window_size:
            self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["BeitRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Add relative position bias if present.
        if self.relative_position_bias is not None:
            attention_scores = attention_scores + self.relative_position_bias().unsqueeze(0)

        # Add shared relative position bias if provided.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class BeitSelfOutput(nn.Module):
    """
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class BeitAttention(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.attention = BeitSelfAttention(config, window_size=window_size)
        self.output = BeitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["BeitRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions, relative_position_bias)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BeitIntermediate(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class BeitOutput(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class BeitLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BeitAttention(config, window_size=window_size)
        self.intermediate = BeitIntermediate(config)
        self.output = BeitOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.drop_path = BeitDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        if init_values > 0:
            self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
            self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
        else:
            self.lambda_1, self.lambda_2 = None, None

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["BeitRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in BEiT, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # apply lambda_1 if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in BEiT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class BeitRelativePositionBias(nn.Module):
    def __init__(self, config: BeitConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH
        # cls to token, token to cls, cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(window_size[0])
        coords_w = torch.arange(window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = torch.zeros(
            size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
        )
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index", relative_position_index, persistent=False)

    def forward(self) -> torch.Tensor:
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1
        )  # Wh*Ww,Wh*Ww,nH

        return relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww


class BeitEncoder(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        if config.use_shared_relative_position_bias:
            self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
        self.layer = nn.ModuleList(
            [
                BeitLayer(
                    config,
                    window_size=window_size if config.use_relative_position_bias else None,
                    drop_path_rate=dpr[i],
                )
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    layer_head_mask,
                )
            else:
                relative_position_bias = (
                    self.relative_position_bias() if self.relative_position_bias is not None else None
                )
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class BeitPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BeitConfig
    base_model_prefix = "beit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, BeitEncoder):
            module.gradient_checkpointing = value


BEIT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`BeitConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BEIT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BeitImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Beit Model transformer outputting raw hidden-states without any specific head on top.",
    BEIT_START_DOCSTRING,
)
class BeitModel(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig, add_pooling_layer: bool = True) -> None:
        super().__init__(config)
        self.config = config

        self.embeddings = BeitEmbeddings(config)
        self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )
        self.pooler = BeitPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BeitModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BeitModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BeitModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class BeitPooler(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.layernorm = (
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is not None:
            # Mean pool the final hidden states of the patch tokens
            patch_tokens = hidden_states[:, 1:, :]
            pooled_output = self.layernorm(patch_tokens.mean(1))
        else:
            # Pool by simply taking the final hidden state of the [CLS] token
            pooled_output = hidden_states[:, 0]

        return pooled_output


@add_start_docstrings(
    """Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.""",
    BEIT_START_DOCSTRING,
)
class BeitForMaskedImageModeling(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        # Classifier head
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedLMOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.beit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.layernorm(sequence_output)
        prediction_scores = self.lm_head(sequence_output[:, 1:])

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores[bool_masked_pos], labels)

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    """,
    BEIT_START_DOCSTRING,
)
class BeitForImageClassification(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=True)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.beit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class BeitConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.bn(output)
        output = self.activation(output)

        return output


class BeitPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            BeitConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class BeitPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = BeitPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels)
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs


class BeitUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = BeitPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = BeitConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = BeitConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = BeitConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = BeitConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class BeitFCNHead(nn.Module):
    """
    Fully Convolutional Networks for Semantic Segmentation. This head is an implementation of
    [FCNNet](https://arxiv.org/abs/1411.4038).

    Args:
        config (BeitConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self, config: BeitConfig, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            BeitConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                BeitConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = BeitConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


@add_start_docstrings(
    """
    Beit Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    """,
    BEIT_START_DOCSTRING,
)
class BeitForSemanticSegmentation(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        # FPNs
        self.fpn1 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
            nn.BatchNorm2d(config.hidden_size),
            nn.GELU(),
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn2 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn3 = nn.Identity()
        self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Semantic segmentation head(s)
        self.decode_head = BeitUperHead(config)
        self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    def compute_loss(self, logits, auxiliary_logits, labels):
        # upsample logits to the images' original size
        upsampled_logits = nn.functional.interpolate(
            logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
        )
        if auxiliary_logits is not None:
            upsampled_auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
        # compute weighted loss
        loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
        main_loss = loss_fct(upsampled_logits, labels)
        loss = main_loss
        if auxiliary_logits is not None:
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss += self.config.auxiliary_loss_weight * auxiliary_loss

        return loss

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.beit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
        )

        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # only keep certain features, and reshape
        # note that we do +1 as the encoder_hidden_states also include the initial embeddings
        features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
        batch_size = pixel_values.shape[0]
        patch_resolution = self.config.image_size // self.config.patch_size
        features = [
            x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features
        ]

        # apply FPNs
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        logits = self.decode_head(features)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)

        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                loss = self.compute_loss(logits, auxiliary_logits, labels)

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )
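

# Minimal usage sketch (not part of the original file), assuming the checkpoints named in
# the docstrings above are available locally or from the Hub:
#
#     from transformers import AutoImageProcessor, BeitModel
#     import torch
#
#     image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
#     model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
#     pixel_values = image_processor(images=torch.zeros(3, 224, 224), return_tensors="pt").pixel_values
#     outputs = model(pixel_values)
#     print(outputs.last_hidden_state.shape)  # torch.Size([1, 197, 768]), cf. _EXPECTED_OUTPUT_SHAPE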