U
    ,:%eb                     @   s|  d dl Z d dlZd dlmZmZmZmZ d dlZd dlmZ d dl	m
Z G dd dejjZG dd dejjZG d	d
 d
ejjZG dd dejjZG dd dejjZG dd dejjZG dd dejZG dd dejZejeeejdddZdd Zd(ejeeeejdddZd)ejeeeejddd Zee ed!d"d#Zee ed!d$d%Zee ed!d&d'ZdS )*    N)AnyDictListOptional)nn)
functionalc                       sV   e Zd ZdZdeeeed fddZee	j
ddd	Ze	j
e	j
d
ddZ  ZS )_ScaledEmbeddingaF  Make continuous embeddings and boost learning rate

    Args:
        num_embeddings (int): number of embeddings
        embedding_dim (int): embedding dimensions
        scale (float, optional): amount to scale learning rate (Default: 10.0)
        smooth (bool, optional): choose to apply smoothing (Default: ``False``)
          $@F)num_embeddingsembedding_dimscalesmoothc                    s   t    t||| _|rftj| jjjdd}|t	d|d 
 d d d f  }|| jjjd d < | jj j|  _|| _d S )Nr   dim   )super__init__r   Z	Embedding	embeddingtorchZcumsumweightdataarangesqrtr   )selfr
   r   r   r   r   	__class__ Y/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torchaudio/models/_hdemucs.pyr   -   s    
$z_ScaledEmbedding.__init__)returnc                 C   s   | j j| j S N)r   r   r   )r   r   r   r   r   8   s    z_ScaledEmbedding.weightxr   c                 C   s   |  || j }|S )zForward pass for embedding with scale.
        Args:
            x (torch.Tensor): input tensor of shape `(num_embeddings)`

        Returns:
            (Tensor):
                Embedding output of shape `(num_embeddings, embedding_dim)`
        )r   r   )r   r!   outr   r   r   forward<   s    	z_ScaledEmbedding.forward)r	   F)__name__
__module____qualname____doc__intfloatboolr   propertyr   Tensorr   r#   __classcell__r   r   r   r   r   #   s
   	r   c                       sf   e Zd ZdZdeeeeeeeeeeeee	f  ed	 fd
dZ
dejeej ejdddZ  ZS )
_HEncLayerat  Encoder layer. This used both by the time and the frequency branch.
    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int, optional): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency domain (Default: ``True``)
        norm_type (string, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 0)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
          FT
group_normr   N)chinchoutkernel_sizestridenorm_groupsemptyfreq	norm_typecontextdconv_kwpadc                    s  t    |
d kri }
dd }|dkr2 fdd}|r>|d nd}tj}|| _|| _|| _|| _|| _|r|dg}|dg}|dg}tj	}||||||| _
||| _| jrt | _t | _t | _n:||d| dd|	  d|	| _|d| | _t|f|
| _d S )	Nc                 S   s   t  S r   r   Identitydr   r   r   <lambda>m       z%_HEncLayer.__init__.<locals>.<lambda>r1   c                    s   t  | S r   r   Z	GroupNormr?   r6   r   r   rA   o   rB   r0   r   r      )r   r   r   Conv1dr8   r4   r5   r7   r<   Conv2dconvnorm1r>   rewritenorm2dconv_DConv)r   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   norm_fnZpad_valklassr   rD   r   r   \   s6    



z_HEncLayer.__init__)r!   injectr   c           
      C   sj  | j s.| dkr.|j\}}}}||d|}| j sh|jd }|| j dksht|d| j|| j  f}| |}| jr||S |dk	r|jd |jd krt	d| dkr| dkr|dddddf }|| }t
| |}| j r>|j\}}}}|ddddd||}| |}|||||dddd}n
| |}| | |}	tj|	dd	}	|	S )
a]  Forward pass for encoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            inject (torch.Tensor, optional): on last layer, combine frequency and time branches through inject param,
                same shape as x (default: ``None``)

        Returns:
            Tensor
                output tensor after encoder layer of shape `(B, C, F / stride, T)` for frequency
                    and shape `(B, C, ceil(T / stride))` for time
        r0   r   NzInjection shapes do not align   rE   r   r   )r8   r   shapeviewr5   Fr<   rH   r7   
ValueErrorgelurI   permutereshaperL   rK   rJ   glu)
r   r!   rP   BCFrTleyzr   r   r   r#      s4    



z_HEncLayer.forward)	r/   r0   r0   FTr1   r   NT)Nr$   r%   r&   r'   r(   r*   strr   r   r   r   r   r,   r#   r-   r   r   r   r   r.   I   s0            ,r.   c                       sb   e Zd ZdZdeeeeeeeeeeeeee	f  ed	 fd
dZ
ejeej dddZ  ZS )
_HDecLayera  Decoder layer. This used both by the time and the frequency branches.
    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        last (bool, optional): whether current layer is final layer (Default: ``False``)
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 1)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency (Default: ``True``)
        norm_type (str, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 1)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
    Fr/   r0   r   Tr1   N)r2   r3   lastr4   r5   r6   r7   r8   r9   r:   r;   r<   c                    s  t    |d kri }dd }|	dkr2 fdd}|r\|| d dkrNtd|| d }nd}|| _|| _|| _|| _|| _|| _|| _	t
j}t
j}|r|dg}|dg}t
j}t
j}|||||| _||| _| jrt
 | _t
 | _n,||d| dd|
  d|
| _|d| | _d S )	Nc                 S   s   t  S r   r=   r?   r   r   r   rA      rB   z%_HDecLayer.__init__.<locals>.<lambda>r1   c                    s   t  | S r   rC   r?   rD   r   r   rA      rB   rE   r   z#Kernel size and stride do not alignr   )r   r   rV   r<   re   r8   r2   r7   r5   r4   r   rF   ConvTranspose1drG   ConvTranspose2dconv_trrK   r>   rJ   rI   )r   r2   r3   re   r4   r5   r6   r7   r8   r9   r:   r;   r<   rN   rO   Zklass_trr   rD   r   r      s@    


z_HDecLayer.__init__)r!   skipc           	      C   s   | j r0| dkr0|j\}}}||| jd|}| jsZ|| }tj| | 	|dd}n|}|dk	rnt
d| | |}| j r| jr|d| j| j ddf }n.|d| j| j| f }|jd |krt
d| jst|}||fS )	a,  Forward pass for decoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            skip (torch.Tensor, optional): on first layer, separate frequency and time branches using param
                (default: ``None``)
            length (int): Size of tensor for output

        Returns:
            (Tensor, Tensor):
                Tensor
                    output tensor after decoder layer of shape `(B, C, F * stride, T)` for frequency domain except last
                        frequency layer shape is `(B, C, kernel_size, T)`. Shape is `(B, C, stride * T)`
                        for time domain.
                Tensor
                    contains the output just before final transposed convolution, which is used when the
                        freq. and time branch separate. Otherwise, does not matter. Shape is
                        `(B, C, F, T)` for frequency and `(B, C, T)` for time.
        rR   rQ   r   r   Nz%Skip must be none when empty is true..z'Last index of z must be equal to length)r8   r   rS   rT   r2   r7   rU   rZ   rI   rJ   rV   rK   rh   r<   re   rW   )	r   r!   ri   lengthr[   r\   r^   r`   ra   r   r   r   r#      s&    
z_HDecLayer.forward)
Fr/   r0   r   FTr1   r   NTrb   r   r   r   r   rd      s4             2rd   c                       s   e Zd ZdZd"ee eeeeeeeeeeeeeeeeeeeed fddZ	dd Z
d#ddZd$ejeeeedddZdd Zdd Zejdd d!Z  ZS )%HDemucsa#
  Hybrid Demucs model from
    *Hybrid Spectrogram and Waveform Source Separation* :cite:`defossez2021hybrid`.

    See Also:
        * :class:`torchaudio.pipelines.SourceSeparationBundle`: Source separation pipeline with pre-trained models.

    Args:
        sources (List[str]): list of source names. List can contain the following source
            options: [``"bass"``, ``"drums"``, ``"other"``, ``"mixture"``, ``"vocals"``].
        audio_channels (int, optional): input/output audio channels. (Default: 2)
        channels (int, optional): initial number of hidden channels. (Default: 48)
        growth (int, optional): increase the number of hidden channels by this factor at each layer. (Default: 2)
        nfft (int, optional): number of fft bins. Note that changing this requires careful computation of
            various shape parameters and will not work out of the box for hybrid models. (Default: 4096)
        depth (int, optional): number of layers in encoder and decoder (Default: 6)
        freq_emb (float, optional): add frequency embedding after the first frequency layer if > 0,
            the actual value controls the weight of the embedding. (Default: 0.2)
        emb_scale (int, optional): equivalent to scaling the embedding learning rate (Default: 10)
        emb_smooth (bool, optional): initialize the embedding with a smooth one (with respect to frequencies).
            (Default: ``True``)
        kernel_size (int, optional): kernel_size for encoder and decoder layers. (Default: 8)
        time_stride (int, optional): stride for the final time layer, after the merge. (Default: 2)
        stride (int, optional): stride for encoder and decoder layers. (Default: 4)
        context (int, optional): context for 1x1 conv in the decoder. (Default: 4)
        context_enc (int, optional): context for 1x1 conv in the encoder. (Default: 0)
        norm_starts (int, optional): layer at which group norm starts being used.
            decoder layers are numbered in reverse order. (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        dconv_depth (int, optional): depth of residual DConv branch. (Default: 2)
        dconv_comp (int, optional): compression of DConv branch. (Default: 4)
        dconv_attn (int, optional): adds attention layers in DConv branch starting at this layer. (Default: 4)
        dconv_lstm (int, optional): adds a LSTM layer in DConv branch starting at this layer. (Default: 4)
        dconv_init (float, optional): initial scale for the DConv branch LayerScale. (Default: 1e-4)
    rE   0         皙?
   Tr/   r0   r   r   -C6?)sourcesaudio_channelschannelsgrowthnfftdepthfreq_emb	emb_scale
emb_smoothr4   time_strider5   r:   context_encnorm_startsr6   dconv_depth
dconv_comp
dconv_attn
dconv_lstm
dconv_initc           +         s  t    || _|| _|| _|| _|
| _|| _|| _|| _	| jd | _
d | _t | _t | _t | _t | _|}|d }|}|}| jd }t| jD ],}||k}||k}||krdnd}|dk}|} |
}!|s|dkrtd|d }!|} d}"d}#|r||
kr|}!d}"d}#|!| ||"|||||||d	d
}$t|$}%d|%d< |
|%d< ||%d< d|%d< t|$}&|#rzt||}|}t||fd|i|$}'|r|#dkr|dkrd|%d< d|%d< t||f||#d|%}(| j|( | j|' |dkr| jt| j }|d }t||f|dk|d|&})|rZt||f|#|dk|d|%}*| jd|* | jd|) |}|}t|| }t|| }|r||
krd}n|| }|dkr|rt|||	|d| _|| _qt|  d S )Nr0   rE   r1   noner   z$When freq is false, freqs must be 1.TF)lstmattnrw   compressinit)r4   r5   r8   r<   r9   r6   r;   r   r8   r4   r5   r<   r:      )r:   r7   )re   r:   )r7   re   r:   )r   r   )r   r   rw   rv   rs   rr   r4   r:   r5   rt   
hop_lengthrx   r   
ModuleListfreq_encoderfreq_decodertime_encodertime_decoderrangerV   dictmaxr.   appendlenrd   insertr(   r   freq_emb_scale_rescale_module)+r   rr   rs   rt   ru   rv   rw   rx   ry   rz   r4   r{   r5   r:   r|   r}   r6   r~   r   r   r   r   r2   Zchin_zr3   Zchout_zfreqsindexr   r   r9   r8   ZstriZkerr<   Z	last_freqkwZkwtZkw_decenctencdectdecr   r   r   r   Q  s    








zHDemucs.__init__c                 C   s   | j }| j}|}||d kr$tdtt|jd | }|d d }| j|||||  |jd  dd}t|||dd dd d f }|jd |d krtd	|ddd| f }|S )
Nr0   zHop length must be nfft // 4rQ   rE   rR   reflect)mode.zESpectrogram's last dimension must be 4 + input size divided by stride)	r   rv   rV   r(   mathceilrS   _pad1d_spectro)r   r!   hlrv   Zx0r_   r<   ra   r   r   r   _spec  s    	$zHDemucs._specNc                 C   sz   | j }t|ddddg}t|ddg}|d d }|tt||  d|  }t|||d}|d||| f }|S )Nr   r   rE   rR   )rj   .)r   rU   r<   r(   r   r   	_ispectro)r   ra   rj   r   r<   r_   r!   r   r   r   _ispec  s    zHDemucs._ispeczero        )r!   padding_leftpadding_rightr   valuec                 C   sP   |j d }|dkr<t||}||kr<t|d|| d f}t|||f||S )zWrapper around F.pad, in order for reflect padding when num_frames is shorter than max_pad.
        Add extra zero padding around in order for padding to not break.rQ   r   r   r   )rS   r   rU   r<   )r   r!   r   r   r   r   rj   Zmax_padr   r   r   r     s    

zHDemucs._pad1dc                 C   s>   |j \}}}}t|ddddd}|||d ||}|S )Nr   r   r0   rE   rR   )rS   r   Zview_as_realrX   rY   )r   ra   r[   r\   r]   r^   mr   r   r   
_magnitude  s    zHDemucs._magnitudec                 C   sF   |j \}}}}}|||dd||dddddd}t| }|S )NrQ   rE   r   r   r0      rR   )rS   rT   rX   r   Zview_as_complex
contiguous)r   r   r[   Sr\   r]   r^   r"   r   r   r   _mask  s    $zHDemucs._mask)inputc           "      C   s  |j dkrtd|j |jd | jkr@td|jd  d|}|jd }| |}| |}|}|j\}}}}	|jddd	}
|jddd	}||
 d
|  }|}|jddd	}|jddd	}|| d
|  }g }g }g }g }t| j	D ]\}}|
|jd  d}|t| jk rP|
|jd  | j| }||}|jsL|
| n|}|||}|dkr| jdk	rtj|jd |jd}| | ddddddf |}|| j|  }|
| qt|}t|}t| jD ]\}}|d}||||d\}}| jt| j }||kr| j||  }|d}|jr|jd dkrhtd|j |dddddf }||d|\}}n|d}||||\}}qt|dkrtdt|dkrtdt|dkrtdt| j} ||| d||	}||dddf  |
dddf  }| |}!| |!|}||| d|}||dddf  |dddf  }|| }|S )a  HDemucs forward call

        Args:
            input (torch.Tensor): input mixed tensor of shape `(batch_size, channel, num_frames)`

        Returns:
            Tensor
                output tensor split into sources of shape `(batch_size, num_sources, channel, num_frames)`
        rR   zDExpected 3D tensor with dimensions (batch, channel, frames). Found: r   zZThe channel dimension of input Tensor must match `audio_channels` of HDemucs model. Found:.rQ   )r   rE   rR   T)r   Zkeepdimgh㈵>)r   rE   Nr   )devicerE   z0If tdec empty is True, pre shape does not match zsaved is not emptyzlengths_t is not emptyzsaved_t is not empty)ndimrV   rS   rs   r   r   meanstd	enumerater   r   r   r   r7   rx   r   r   r   tZ	expand_asr   Z
zeros_liker   poprw   r   AssertionErrorrr   rT   r   r   )"r   r   r!   rj   ra   magr[   r\   ZFqr^   r   r   ZxtZmeantZstdtZsavedZsaved_tlengthsZ	lengths_tidxencoderP   r   ZfrsZembdecoderi   preoffsetr   Zlength_t_r   Zzoutr   r   r   r#     s    





(






$
$zHDemucs.forward)rE   rl   rE   rm   rn   ro   rp   Tr/   rE   r0   r   r   r0   r0   rE   r0   r0   r0   rq   )N)r   r   )r$   r%   r&   r'   r   rc   r(   r)   r*   r   r   r   r   r,   r   r   r   r#   r-   r   r   r   r   rk   -  sf   &                     


rk   c                       sB   e Zd ZdZdeeeeeeeeeed
 fd	d
Zdd Z	  Z
S )rM   a  
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Also before entering each residual branch, dimension is projected on a smaller subspace,
    e.g. of dim `channels // compress`.

    Args:
        channels (int): input/output channels for residual branch.
        compress (float, optional): amount of channel compression inside the branch. (default: 4)
        depth (int, optional): number of layers in the residual branch. Each layer has its own
            projection, and potentially LSTM and attention.(default: 2)
        init (float, optional): initial scale for LayerNorm. (default: 1e-4)
        norm_type (bool, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        attn (bool, optional): use LocalAttention. (Default: ``False``)
        heads (int, optional): number of heads for the LocalAttention.  (default: 4)
        ndecay (int, optional): number of decay controls in the LocalAttention. (default: 4)
        lstm (bool, optional): use LSTM. (Default: ``False``)
        kernel_size (int, optional): kernel size for the (dilated) convolutions. (default: 3)
    r0   rE   rq   r1   FrR   )
rt   r   rw   r   r9   r   headsndecayr   r4   c              
      s<  t    |
d dkrtd|| _|| _t|| _|dk}dd }|dkrTdd }t|| }tj	}t
g | _t| jD ]}|rtd|nd}||
d  }tj|||
||d	||| t|d| d|d| tdt||g}|r|d
t|||d |	r |d
t|ddd tj| }| j| q|d S )NrE   r   z(Kernel size should not be divisible by 2c                 S   s   t  S r   r=   r?   r   r   r   rA     rB   z!_DConv.__init__.<locals>.<lambda>r1   c                 S   s   t d| S )Nr   rC   r?   r   r   r   rA     rB   r   )dilationpaddingrR   )r   r   Tlayersri   )r   r   rV   rt   r   absrw   r(   r   ZGELUr   r   r   powrF   ZGLU_LayerScaler   _LocalState_BLSTMZ
Sequentialr   )r   rt   r   rw   r   r9   r   r   r   r   r4   ZdilaterN   ZhiddenZactr@   r   r   modslayerr   r   r   r     s<    


	
z_DConv.__init__c                 C   s   | j D ]}||| }q|S )zDConv forward call

        Args:
            x (torch.Tensor): input tensor for convolution

        Returns:
            Tensor
                Output after being run through layers.
        )r   )r   r!   r   r   r   r   r#     s    

z_DConv.forward)	r0   rE   rq   r1   Fr0   r0   FrR   )r$   r%   r&   r'   r(   r)   rc   r*   r   r#   r-   r   r   r   r   rM   }  s.            3rM   c                       s>   e Zd ZdZd
eed fddZejejddd	Z	  Z
S )r   ae  
    BiLSTM with same hidden units as input dim.
    If `max_steps` is not None, input will be splitting in overlapping
    chunks and the LSTM applied separately on each chunk.
    Args:
        dim (int): dimensions at LSTM layer.
        layers (int, optional): number of LSTM layers. (default: 1)
        skip (bool, optional): (default: ``False``)
    r   Fr   c                    s@   t    d| _tjd|||d| _td| || _|| _d S )N   T)bidirectionalZ
num_layersZhidden_sizeZ
input_sizerE   )	r   r   	max_stepsr   ZLSTMr   ZLinearlinearri   )r   r   r   ri   r   r   r   r     s
    
z_BLSTM.__init__r    c              	   C   s  |j \}}}|}d}d}d}d}	| jdk	rv|| jkrv| j}|d }t|||}
|
j d }	d}|
ddddd||}|ddd}| |d }| |}|ddd}|rg }||d||}
|d }t|	D ]}|dkr||
dd|ddd| f  q||	d kr:||
dd|dd|df  q||
dd|dd|| f  qt	
|d}|d	d|f }|}| jr|| }|S )
a  BLSTM forward call

        Args:
            x (torch.Tensor): input tensor for BLSTM shape is `(batch_size, dim, time_steps)`

        Returns:
            Tensor
                Output after being run through bidirectional LSTM. Shape is `(batch_size, dim, time_steps)`
        Fr   NrE   Tr   rR   rQ   .)rS   r   _unfoldrX   rY   r   r   r   r   r   catri   )r   r!   r[   r\   r^   r`   Zframedwidthr5   Znframesframesr"   limitkr   r   r   r#     sB    



&$&z_BLSTM.forward)r   F)r$   r%   r&   r'   r(   r*   r   r   r,   r#   r-   r   r   r   r   r     s   
r   c                       s@   e Zd ZdZd	eeed fddZejejdddZ  Z	S )
r   a   Local state allows to have attention based only on data (no positional embedding),
    but while setting a constraint on the time window (e.g. decaying penalty term).
    Also a failed experiments with trying to provide some frequency based attention.
    r0   )rt   r   r   c                    s   t t|   || dkr"td|| _|| _t||d| _t||d| _	t||d| _
t||| d| _|r| jj jd9  _| jjdkrtdd| jjjdd< t||d  |d| _dS )z
        Args:
            channels (int): Size of Conv1d layers.
            heads (int, optional):  (default: 4)
            ndecay (int, optional): (default: 4)
        r   z$Channels must be divisible by heads.r   g{Gz?Nzbias must not be None.r   )r   r   r   rV   r   r   r   rF   contentquerykeyquery_decayr   r   biasproj)r   rt   r   r   r   r   r   r     s    z_LocalState.__init__r    c                 C   s|  |j \}}}| j}tj||j|jd}|dddf |dddf  }| |||d|}| |||d|}	t	d|	|}
|
t
|	j d  }
| jrtjd| jd |j|jd}| |||d|}t|d }|ddd |  t
| j }|
t	d||7 }
|
tj||
jtjdd tj|
dd	}| |||d|}t	d
||}||d|}|| | S )zLocalState forward call

        Args:
            x (torch.Tensor): input tensor for LocalState

        Returns:
            Tensor
                Output after being run through LocalState layer.
        )r   dtypeNrQ   zbhct,bhcs->bhtsrE   r   zfts,bhfs->bhtsir   zbhts,bhct->bhcs)rS   r   r   r   r   r   r   rT   r   Zeinsumr   r   r   r   Zsigmoidr   Zmasked_fill_eyer*   Zsoftmaxr   rY   r   )r   r!   r[   r\   r^   r   ZindexesdeltaZquerieskeysdotsZdecaysZdecay_qZdecay_kernelweightsr   resultr   r   r   r#   6  s(    
 $z_LocalState.forward)r0   r0   )
r$   r%   r&   r'   r(   r   r   r,   r#   r-   r   r   r   r   r     s   r   c                       s>   e Zd ZdZd	eed fddZejejdddZ	  Z
S )
r   zLayer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonally residual outputs close to 0 initially, then learnt.
    r   )rt   r   c                    s4   t    ttj|dd| _|| jjdd< dS )z
        Args:
            channels (int): Size of  rescaling
            init (float, optional): Scale to default to (default: 0)
        T)Zrequires_gradN)r   r   r   	Parameterr   Zzerosr   r   )r   rt   r   r   r   r   r   a  s    
z_LayerScale.__init__r    c                 C   s   | j dddf | S )zLayerScale forward call

        Args:
            x (torch.Tensor): input tensor for LayerScale

        Returns:
            Tensor
                Output after rescaling tensor.
        N)r   )r   r!   r   r   r   r#   k  s    
z_LayerScale.forward)r   )r$   r%   r&   r'   r(   r)   r   r   r,   r#   r-   r   r   r   r   r   \  s   
r   )ar4   r5   r   c                    s   t  jdd }t jd }t|| }|d | | }tj d|| gd  fddt  D }|d dkrt	d|dd |dg }|
| |
|  ||S )	zGiven input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.
    This will pad the input so that `F = ceil(T / K)`.
    see https://github.com/pytorch/pytorch/issues/60466
    NrQ   r   r   )r   r<   c                    s   g | ]}  |qS r   )r5   ).0r   r   r   r   
<listcomp>  s     z_unfold.<locals>.<listcomp>zData should be contiguous.)listrS   r(   r   r   rU   r<   r   r   rV   r   Z
as_strided)r   r4   r5   rS   rj   Zn_framesZ
tgt_lengthstridesr   r   r   r   x  s    

r   c                 C   sp   |   D ]b}t|tjtjtjtjfr|j 	 }|d d }|j j
|  _
|jdk	r|j j
|  _
qdS )zI
    Rescales initial weight scale for all models within the module.
    g?g      ?N)modules
isinstancer   rF   rf   rG   rg   r   r   detachr   r   )modulesubr   r   r   r   r   r     s    
r      )r!   n_fftr   r<   r   c           
      C   s   t | jd d }t| jd }| d|} tj| |d|  |t|| |ddddd	}|j\}}}	|||	g |	|S )NrQ   r   Tr   )window
win_length
normalizedcenterZreturn_complexZpad_mode)
r   rS   r(   rY   r   Zstfthann_windowtoextendrT   )
r!   r   r   r<   otherrj   ra   r   r   framer   r   r   r     s"    
r   )ra   r   rj   r<   r   c              
   C   s   t | jd d }t| jd }t| jd }d| d }| d||} |d|  }tj| ||t|| j|d|dd}	|	j\}
}|	| |	|S )Nr   rQ   rE   r   T)r   r   r   rj   r   )
r   rS   r(   rT   r   Zistftr   r   realr   )ra   r   rj   r<   r   r   r   r   r   r!   r   r   r   r   r     s&    


r   )rr   r   c                 C   s   t | dddS )zBuilds low nfft (1024) version of :class:`HDemucs`, suitable for sample rates around 8 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    i   r   rr   rv   rw   rk   rr   r   r   r   hdemucs_low  s    r  c                 C   s   t | dddS )a  Builds medium nfft (2048) version of :class:`HDemucs`, suitable for sample rates of 16-32 kHz.

    .. note::

        Medium HDemucs has not been tested against the original Hybrid Demucs as this nfft and depth configuration is
        not compatible with the original implementation in https://github.com/facebookresearch/demucs

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    r   rn   r  r  r  r   r   r   hdemucs_medium  s    r  c                 C   s   t | dddS )zBuilds medium nfft (4096) version of :class:`HDemucs`, suitable for sample rates of 44.1-48 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    rm   rn   r  r  r  r   r   r   hdemucs_high  s    r  )r   r   r   )r   r   r   )r   typingtpr   r   r   r   r   r   Ztorch.nnr   rU   Moduler   r.   rd   rk   rM   r   r   r   r,   r(   r   r   r   r   rc   r  r  r  r   r   r   r   <module>   s,   &nv  RWCE