""" SAM model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)

SAM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/sam-vit-huge": "https://huggingface.co/facebook/sam-vit-huge/resolve/main/config.json",
    "facebook/sam-vit-large": "https://huggingface.co/facebook/sam-vit-large/resolve/main/config.json",
    "facebook/sam-vit-base": "https://huggingface.co/facebook/sam-vit-base/resolve/main/config.json",
}


class SamPromptEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration with the defaults
    will yield a configuration similar to that of the SAM ViT-H
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
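        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.

    Example (an illustrative sketch using the defaults documented above):

    ```python
    >>> from transformers import SamPromptEncoderConfig

    >>> # Initializing the prompt-encoder config with its documented defaults.
    >>> configuration = SamPromptEncoderConfig()

    >>> # image_embedding_size is derived in __init__ as image_size // patch_size, i.e. 1024 // 16.
    >>> configuration.image_embedding_size
    64
    ```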
    """

    def __init__(
        self,
        hidden_size=256,
        image_size=1024,
        patch_size=16,
        mask_input_channels=16,
        num_point_embeddings=4,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.image_embedding_size = image_size // patch_size
        self.mask_input_channels = mask_input_channels
        self.num_point_embeddings = num_point_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps


class SamMaskDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
    mask decoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration similar to that of the SAM ViT-H
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function used inside the `SamMaskDecoder` module.
        mlp_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsampling rate of the attention layer.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The number of layers in the IoU head module.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The dimensionality of the hidden states in the IoU head module.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.

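    Example (an illustrative sketch using the defaults documented above):

    ```python
    >>> from transformers import SamMaskDecoderConfig

    >>> # Initializing the mask-decoder config with its documented defaults.
    >>> configuration = SamMaskDecoderConfig()

    >>> # Three mask outputs, as in the Segment Anything paper.
    >>> configuration.num_multimask_outputs
    3
    ```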
    """

    def __init__(
        self,
        hidden_size=256,
        hidden_act="relu",
        mlp_dim=2048,
        num_hidden_layers=2,
        num_attention_heads=8,
        attention_downsample_rate=2,
        num_multimask_outputs=3,
        iou_head_depth=3,
        iou_head_hidden_dim=256,
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_dim = mlp_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_downsample_rate = attention_downsample_rate
        self.num_multimask_outputs = num_multimask_outputs
        self.iou_head_depth = iou_head_depth
        self.iou_head_hidden_dim = iou_head_hidden_dim
        self.layer_norm_eps = layer_norm_eps


class SamVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration similar to that of the SAM ViT-H
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        output_channels (`int`, *optional*, defaults to 256):
            Dimensionality of the output channels in the Patch Encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        image_size (`int`, *optional*, defaults to 1024):
            Expected resolution. Target size of the resized input image.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the patches to be extracted from the input image.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to query, key, value projections.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of mlp hidden dim to embedding dim.
        use_abs_pos (`bool`, *optional*, defaults to `True`):
            Whether to use absolute position embedding.
        use_rel_pos (`bool`, *optional*, defaults to `True`):
            Whether to use relative position embedding.
        window_size (`int`, *optional*, defaults to 14):
            Window size for relative position.
        global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
            The indexes of the global attention layers.
        num_pos_feats (`int`, *optional*, defaults to 128):
            The dimensionality of the position embedding.
        mlp_dim (`int`, *optional*, defaults to `None`):
            The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio *
            hidden_size`.
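
    Example (an illustrative sketch using the defaults documented above):

    ```python
    >>> from transformers import SamVisionConfig

    >>> # Initializing the vision config with its documented defaults.
    >>> configuration = SamVisionConfig()

    >>> # mlp_dim falls back to int(mlp_ratio * hidden_size) = int(4.0 * 768) when left as None.
    >>> configuration.mlp_dim
    3072
    ```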
    """

    def __init__(
        self,
        hidden_size=768,
        output_channels=256,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=1024,
        patch_size=16,
        hidden_act="gelu",
        layer_norm_eps=1e-06,
        attention_dropout=0.0,
        initializer_range=1e-10,
        qkv_bias=True,
        mlp_ratio=4.0,
        use_abs_pos=True,
        use_rel_pos=True,
        window_size=14,
        global_attn_indexes=[2, 5, 8, 11],
        num_pos_feats=128,
        mlp_dim=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.output_channels = output_channels
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.qkv_bias = qkv_bias
        self.mlp_ratio = mlp_ratio
        self.use_abs_pos = use_abs_pos
        self.use_rel_pos = use_rel_pos
        self.window_size = window_size
        self.global_attn_indexes = global_attn_indexes
        self.num_pos_feats = num_pos_feats
        # If mlp_dim is not provided, fall back to mlp_ratio * hidden_size as documented above.
        self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim


class SamConfig(PretrainedConfig):
    r"""
    [`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
    SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].

        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamPromptEncoderConfig,
    ...     SamMaskDecoderConfig,
    ...     SamModel,
    ... )

    >>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamConfig()

    >>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

    >>> # Initializing SAM vision, prompt-encoder, and mask-decoder configurations
    >>> vision_config = SamVisionConfig()
    >>> prompt_encoder_config = SamPromptEncoderConfig()
    >>> mask_decoder_config = SamMaskDecoderConfig()

    >>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```"""

    model_type = "sam"

    def __init__(
        self,
        vision_config=None,
        prompt_encoder_config=None,
        mask_decoder_config=None,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        vision_config = vision_config if vision_config is not None else {}
        prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
        mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}

        # Sub-configs may be passed either as plain dicts or as config instances;
        # normalize instances to dicts before re-instantiating them below.
        if isinstance(vision_config, SamVisionConfig):
            vision_config = vision_config.to_dict()
        if isinstance(prompt_encoder_config, SamPromptEncoderConfig):
            prompt_encoder_config = prompt_encoder_config.to_dict()
        if isinstance(mask_decoder_config, SamMaskDecoderConfig):
            mask_decoder_config = mask_decoder_config.to_dict()

        self.vision_config = SamVisionConfig(**vision_config)
        self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config)
        self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config)
        self.initializer_range = initializer_range