from typing import Optional, Tuple

import torch

__all__ = ["Conformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    # Boolean mask of shape (batch, max_length); True marks padded (invalid)
    # frames, matching torch.nn.MultiheadAttention's key_padding_mask convention.
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask


class _ConvolutionModule(torch.nn.Module):
    r"""Conformer convolution module.

    Args:
        input_dim (int): input dimension.
        num_channels (int): number of depthwise convolution layer input channels.
        depthwise_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``)
        use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim: int,
        num_channels: int,
        depthwise_kernel_size: int,
        dropout: float = 0.0,
        bias: bool = False,
        use_group_norm: bool = False,
    ) -> None:
        super().__init__()
        if (depthwise_kernel_size - 1) % 2 != 0:
            raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.")
        self.layer_norm = torch.nn.LayerNorm(input_dim)
        self.sequential = torch.nn.Sequential(
            torch.nn.Conv1d(
                input_dim,
                2 * num_channels,
                1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.GLU(dim=1),
            torch.nn.Conv1d(
                num_channels,
                num_channels,
                depthwise_kernel_size,
                stride=1,
                padding=(depthwise_kernel_size - 1) // 2,
                groups=num_channels,
                bias=bias,
            ),
            torch.nn.GroupNorm(num_groups=1, num_channels=num_channels)
            if use_group_norm
            else torch.nn.BatchNorm1d(num_channels),
            torch.nn.SiLU(),
            torch.nn.Conv1d(
                num_channels,
                input_dim,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=bias,
            ),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            input (torch.Tensor): with shape `(B, T, D)`.

        Returns:
            torch.Tensor: output, with shape `(B, T, D)`.
        """
        x = self.layer_norm(input)
        x = x.transpose(1, 2)
        x = self.sequential(x)
        return x.transpose(1, 2)
S )_FeedForwardModulezPositionwise feed forward layer.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        dropout (float, optional): dropout probability. (Default: 0.0)
    r   N)r   
hidden_dimr   r   c                    s`   t    tjtj|tjj||ddtj tj|tjj||ddtj|| _	d S )NT)r   )
r!   r"   r   r$   r'   r%   ZLinearr(   r)   r*   )r+   r   r<   r   r,   r   r   r"   d   s    



z_FeedForwardModule.__init__r.   c                 C   s
   |  |S )z
        Args:
            input (torch.Tensor): with shape `(*, D)`.

        Returns:
            torch.Tensor: output, with shape `(*, D)`.
        )r*   )r+   r/   r   r   r   r2   o   s    z_FeedForwardModule.forward)r   )r3   r4   r5   r6   r   r7   r"   r   r9   r2   r:   r   r   r,   r   r;   [   s   r;   c                
       sf   e Zd ZdZdeeeeeeedd fddZej	ej	dd	d
Z
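
# Note (added comment): the half-step residual weighting of the two feed-forward
# modules described in the Conformer paper is applied by the caller
# (ConformerLayer scales each FFN output by 0.5), not inside this module.
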
ej	eej	 ej	dddZ  ZS )ConformerLayera  Conformer layer that constitutes Conformer.

    Args:
        input_dim (int): input dimension.
        ffn_dim (int): hidden layer dimension of feedforward network.
        num_attention_heads (int): number of attention heads.
        depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)
    r   FN)r   ffn_dimnum_attention_headsdepthwise_conv_kernel_sizer   r   convolution_firstr   c                    s   t    t|||d| _tj|| _tjj|||d| _	tj
|| _t||||d|d| _t|||d| _tj|| _|| _d S )N)r   T)r   r   r   r   r   r   )r!   r"   r;   ffn1r   r$   r%   self_attn_layer_normZMultiheadAttention	self_attnr)   self_attn_dropoutr   conv_moduleffn2final_layer_normrA   )r+   r   r>   r?   r@   r   r   rA   r,   r   r   r"      s     

	zConformerLayer.__init__r.   c                 C   s2   |}| dd}| |}| dd}|| }|S )Nr   r	   )r0   rF   )r+   r/   residualr   r   r   _apply_convolution   s    
z!ConformerLayer._apply_convolution)r/   key_padding_maskr   c                 C   s   |}|  |}|d | }| jr*| |}|}| |}| j||||dd\}}| |}|| }| jsr| |}|}| |}|d | }| |}|S )a
  
        Args:
            input (torch.Tensor): input, with shape `(T, B, D)`.
            key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer.

        Returns:
            torch.Tensor: output, with shape `(T, B, D)`.
        g      ?F)querykeyvaluerK   Zneed_weights)rB   rA   rJ   rC   rD   rE   rG   rH   )r+   r/   rK   rI   r1   _r   r   r   r2      s.    	







zConformerLayer.forward)r   FF)r3   r4   r5   r6   r   r7   r8   r"   r   r9   rJ   r   r2   r:   r   r   r,   r   r=   z   s       r=   c                
class Conformer(torch.nn.Module):
    r"""Conformer architecture introduced in
    *Conformer: Convolution-augmented Transformer for Speech Recognition*
    :cite:`gulati2020conformer`.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Conformer layer.
        ffn_dim (int): hidden layer dimension of feedforward networks.
        num_layers (int): number of Conformer layers to instantiate.
        depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)

    Examples:
        >>> conformer = Conformer(
        >>>     input_dim=80,
        >>>     num_heads=4,
        >>>     ffn_dim=128,
        >>>     num_layers=4,
        >>>     depthwise_conv_kernel_size=31,
        >>> )
        >>> lengths = torch.randint(1, 400, (10,))  # (batch,)
        >>> input = torch.rand(10, int(lengths.max()), input_dim)  # (batch, num_frames, input_dim)
        >>> output = conformer(input, lengths)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ):
        super().__init__()

        self.conformer_layers = torch.nn.ModuleList(
            [
                ConformerLayer(
                    input_dim,
                    ffn_dim,
                    num_heads,
                    depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""
        Args:
            input (torch.Tensor): with shape `(B, T, input_dim)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor)
                torch.Tensor
                    output frames, with shape `(B, T, input_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        """
        encoder_padding_mask = _lengths_to_padding_mask(lengths)

        x = input.transpose(0, 1)
        for layer in self.conformer_layers:
            x = layer(x, encoder_padding_mask)
        return x.transpose(0, 1), lengths