U
    ×9%eÿ ã                   @   s~  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZ e e¡Z dZ!dgZ"eG dd„ deƒƒZ#eG dd„ deƒƒZ$dGej%ej&ee' dœdd„Z(ej)j*dd„ ƒZ+dHdd„Z,dd„ Z-G d d!„ d!ejj.ƒZ/G d"d#„ d#ej.ƒZ0G d$d%„ d%ej.ƒZ1G d&d'„ d'ej.ƒZ2G d(d)„ d)ej.ƒZ3G d*d+„ d+ej.ƒZ4G d,d-„ d-ej.ƒZ5G d.d/„ d/ej.ƒZ6G d0d1„ d1ej.ƒZ7G d2d3„ d3ej.ƒZ8G d4d5„ d5ej.ƒZ9G d6d7„ d7ej.ƒZ:G d8d9„ d9ej.ƒZ;G d:d;„ d;ej.ƒZ<G d<d=„ d=ej.ƒZ=G d>d?„ d?ej.ƒZ>G d@dA„ dAeƒZ?dBZ@dCZAedDe@ƒG dEdF„ dFe?ƒƒZBdS )Iz PyTorch VITS model.é    N)Ú	dataclass)ÚAnyÚOptionalÚTupleÚUnion)Únné   )ÚACT2FN)Úis_deepspeed_zero3_enabled)ÚBaseModelOutputÚModelOutput)ÚPreTrainedModel)Úadd_start_docstringsÚ%add_start_docstrings_to_model_forwardÚloggingÚreplace_return_docstringsé   )Ú
VitsConfigr   zfacebook/mms-tts-engc                   @   sp   e Zd ZU dZdZejed< dZejed< dZ	e
eej  ed< dZe
eej  ed< dZe
eej  ed< dS )ÚVitsModelOutputaC  
    Describes the outputs for the VITS model, with potential hidden states and attentions.

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        sequence_lengths  (`torch.FloatTensor` of shape `(batch_size,)`):
            The length in samples of each element in the `waveform` batch.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
            GAN decoder model to obtain the final audio waveform.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NÚwaveformÚsequence_lengthsÚspectrogramÚhidden_statesÚ
attentions)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚtorchÚFloatTensorÚ__annotations__r   r   r   r   r   r   © r!   r!   úe/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/vits/modeling_vits.pyr   3   s   
r   c                   @   sh   e Zd ZU dZdZejed< dZejed< dZ	ejed< dZ
eeej  ed< dZeeej  ed< dS )ÚVitsTextEncoderOutputaa  
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted mean values of the prior distribution for the latent text variables.
        prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted log-variance values of the prior distribution for the latent text variables.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NÚlast_hidden_stateÚprior_meansÚprior_log_variancesr   r   )r   r   r   r   r$   r   r   r    r%   r&   r   r   r   r   r!   r!   r!   r"   r#   T   s   
r#   )ÚmaskÚdtypeÚtgt_lenc                 C   sj   |   ¡ \}}|dk	r|n|}| dd…dddd…f  |d||¡ |¡}d| }| | tj¡t |¡j¡S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr   ç      ð?)ÚsizeÚexpandÚtoZmasked_fillr   ÚboolZfinfoÚmin)r'   r(   r)   ÚbszÚsrc_lenZexpanded_maskZinverted_maskr!   r!   r"   Ú_expand_masku   s
    *r2   c                 C   sT   | | }t  |d d …d |…d d …f ¡}t  |d d …|d …d d …f ¡}|| }|S ©N)r   ÚtanhÚsigmoid)Zinput_aZinput_bZnum_channelsZin_actZt_actZs_actÚactsr!   r!   r"   Úfused_add_tanh_sigmoid_multiplyƒ   s
      r7   Fç      @çü©ñÒMbP?c	                 C   sÎ   | | k| |k@ }	|	 }
t  | ¡}t  | ¡}t t d| ¡d ¡}tjj|dd}||d< ||d< | |
 ||
< d||
< t| |	 ||	dd…f ||	dd…f ||	dd…f |||||d	\||	< ||	< ||fS )	aô	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )Úpad©.r   ©.éÿÿÿÿç        N)	ÚinputsÚunnormalized_widthsÚunnormalized_heightsÚunnormalized_derivativesÚreverseÚ
tail_boundÚmin_bin_widthÚmin_bin_heightÚmin_derivative)	r   Ú
zeros_likeÚnpÚlogÚexpr   Ú
functionalr:   Ú_rational_quadratic_spline)r?   r@   rA   rB   rC   rD   rE   rF   rG   Zinside_interval_maskZoutside_interval_maskÚoutputsÚlog_abs_detÚconstantr!   r!   r"   Ú(_unconstrained_rational_quadratic_splineŒ   s,    .

÷rQ   c	           *      C   s  |}	| }
t  | ¡|
k s&t  | ¡|	kr.tdƒ‚|jd }|| dkrXtd|› d|› ƒ‚|| dkrxtd|› d|› ƒ‚tjj|dd}|d||  |  }t j|dd}tjj	|d	d
dd}|	|
 | |
 }|
|d< |	|d< |ddd…f |ddd…f  }|tj 
|¡ }tjj|dd}|d||  |  }t j|dd}tjj	|d	d
dd}|	|
 | |
 }|
|d< |	|d< |ddd…f |ddd…f  }|rž|n|}|d  d7  < t j| d |kddd }|d }| d|¡d }| d|¡d }| d|¡d }|| }| d|¡d }| d|¡d }|ddd…f  d|¡d }| d|¡d }|| d|  }|s| | | }|d|  }||| d¡ ||   }|||  }|||  } | d¡|| d¡ d| |  |d|  d¡   }!t  |!¡dt  |¡  }"| |"fS | | }#|#| }$|||  |$ }%|| |$ }&| |# }'|& d¡d|% |'  }(|(dk ¡ svtd|(› ƒ‚d|' |& t  |(¡  })|)| | } |)d|)  }|||  }| d¡||) d¡ d| |  |d|)  d¡   }!t  |!¡dt  |¡  }"| |" fS dS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    z-Input to a transform is not within its domainr=   r*   zMinimal bin width z" too large for the number of bins zMinimal bin height ©Údimr   )r   r   rP   r>   )r:   ÚmodeÚvaluer;   r<   .Ngíµ ÷Æ°>).Né   é   r   zinvalid discriminant )r   r/   ÚmaxÚ
ValueErrorÚshaper   rL   ÚsoftmaxÚcumsumr:   ZsoftplusÚsumÚgatherÚpowrJ   ÚallÚRuntimeErrorÚsqrt)*r?   r@   rA   rB   rC   rD   rE   rF   rG   Úupper_boundÚlower_boundÚnum_binsÚwidthsZ	cumwidthsZderivativesZheightsZ
cumheightsZbin_locationsZbin_idxZinput_cumwidthsZinput_bin_widthsZinput_cumheightsÚdeltaZinput_deltaZinput_derivativesZinput_derivatives_plus_oneZinput_heightsZintermediate1ÚthetaZtheta_one_minus_thetaÚ	numeratorÚdenominatorrN   Zderivative_numeratorrO   Zintermediate2Zintermediate3ÚaÚbÚcZdiscriminantÚrootr!   r!   r"   rM   Ö   s–    ,
  
ÿþÿ

ÿþÿrM   c                       s6   e Zd Zeedœ‡ fdd„Zd	dd„Zdd„ Z‡  ZS )
ÚVitsWaveNet)ÚconfigÚ
num_layersc                    sD  t ƒ  ¡  |j| _|| _tj ¡ | _tj ¡ | _t 	|j
¡| _ttjjdƒrXtjjj}ntjj}|jdkr”tj |jd|j | d¡}||dd| _t|ƒD ]¢}|j| }|j| | d }tjj|jd|j |j||d}||dd}| j |¡ ||d k rd|j }	n|j}	tj |j|	d¡}
||
dd}
| j |
¡ qœd S )NÚweight_normr   rV   r   Úweight)Úname)Úin_channelsÚout_channelsÚkernel_sizeÚdilationÚpadding)ÚsuperÚ__init__Úhidden_sizerq   r   r   Ú
ModuleListÚ	in_layersÚres_skip_layersÚDropoutZwavenet_dropoutÚdropoutÚhasattrÚutilsZparametrizationsrr   Úspeaker_embedding_sizeÚConv1dÚ
cond_layerÚrangeZwavenet_dilation_rateZwavenet_kernel_sizeÚappend)Úselfrp   rq   rr   r†   Úirx   ry   Zin_layerZres_skip_channelsZres_skip_layer©Ú	__class__r!   r"   r{   _  s<    


ûzVitsWaveNet.__init__Nc                 C   s   t  |¡}t  | jg¡}|d k	r*|  |¡}t| jƒD ]â}| j| |ƒ}|d k	r‚|d | j }|d d …||d| j  …d d …f }	n
t  |¡}	t||	|d ƒ}
|  	|
¡}
| j
| |
ƒ}|| jd k r|d d …d | j…d d …f }|| | }||d d …| jd …d d …f  }q4|| }q4|| S )NrV   r   r   )r   rH   Z	IntTensorr|   r†   r‡   rq   r~   r7   r   r   )r‰   r?   Úpadding_maskÚglobal_conditioningrN   Znum_channels_tensorrŠ   r   Zcond_offsetZglobal_statesr6   Zres_skip_actsZres_actsr!   r!   r"   Úforwardˆ  s&    

&

"
zVitsWaveNet.forwardc                 C   sR   | j dkrtjj | j¡ | jD ]}tjj |¡ q | jD ]}tjj |¡ q:d S )Nr   )r„   r   r   rƒ   Úremove_weight_normr†   r~   r   ©r‰   Úlayerr!   r!   r"   r   ¥  s    


zVitsWaveNet.remove_weight_norm)N)	r   r   r   r   Úintr{   r   r   Ú__classcell__r!   r!   r‹   r"   ro   ^  s   )
ro   c                       s,   e Zd Zedœ‡ fdd„Zddd„Z‡  ZS )ÚVitsPosteriorEncoder©rp   c                    sR   t ƒ  ¡  |j| _t |j|jd¡| _t	||j
d| _t |j| jd d¡| _d S )Nr   ©rq   rV   )rz   r{   Ú	flow_sizerv   r   r…   Zspectrogram_binsr|   Úconv_prero   Z$posterior_encoder_num_wavenet_layersÚwavenetÚ	conv_proj©r‰   rp   r‹   r!   r"   r{   ¯  s
    
zVitsPosteriorEncoder.__init__Nc                 C   sf   |   |¡| }|  |||¡}|  |¡| }tj|| jdd\}}|t |¡t |¡  | }|||fS )Nr   rR   )r™   rš   r›   r   Úsplitrv   Ú
randn_likerK   )r‰   r?   r   rŽ   ÚstatsÚmeanÚ
log_stddevZsampledr!   r!   r"   r   ·  s    zVitsPosteriorEncoder.forward)N©r   r   r   r   r{   r   r”   r!   r!   r‹   r"   r•   ®  s   r•   c                       s@   e Zd Zd‡ fdd„	Zddd„Zd	d
„ Zdd„ Zdd„ Z‡  ZS )ÚHifiGanResidualBlockr   ©r   r   é   çš™™™™™¹?c                    sb   t ƒ  ¡  |ˆ_t ‡ ‡‡‡fdd„ttˆƒƒD ƒ¡ˆ_t ‡ ‡‡fdd„ttˆƒƒD ƒ¡ˆ_d S )Nc                    s2   g | ]*}t jˆ ˆ ˆd ˆ| ˆ ˆˆ| ¡d‘qS ©r   )Ústriderx   ry   ©r   r…   Úget_padding)Ú.0rŠ   ©Úchannelsrx   rw   r‰   r!   r"   Ú
<listcomp>Ç  s   	øúz1HifiGanResidualBlock.__init__.<locals>.<listcomp>c                    s*   g | ]"}t jˆ ˆ ˆd d ˆ ˆd ¡d‘qS r§   r©   ©r«   Ú_)r­   rw   r‰   r!   r"   r®   Ô  s   	ø
ú)	rz   r{   Úleaky_relu_sloper   r}   r‡   ÚlenÚconvs1Úconvs2)r‰   r­   rw   rx   r±   r‹   r¬   r"   r{   Â  s    
	
÷ÿ	
÷ÿzHifiGanResidualBlock.__init__r   c                 C   s   || | d S )NrV   r!   )r‰   rw   rx   r!   r!   r"   rª   á  s    z HifiGanResidualBlock.get_paddingc                 C   s4   | j D ]}tj |¡ q| jD ]}tj |¡ qd S r3   )r³   r   rƒ   rr   r´   r‘   r!   r!   r"   Úapply_weight_normä  s    

z&HifiGanResidualBlock.apply_weight_normc                 C   s4   | j D ]}tj |¡ q| jD ]}tj |¡ qd S r3   )r³   r   rƒ   r   r´   r‘   r!   r!   r"   r   ê  s    

z'HifiGanResidualBlock.remove_weight_normc                 C   sX   t | j| jƒD ]D\}}|}tj || j¡}||ƒ}tj || j¡}||ƒ}|| }q|S r3   )Úzipr³   r´   r   rL   Ú
leaky_relur±   )r‰   r   Zconv1Zconv2Úresidualr!   r!   r"   r   ð  s    
zHifiGanResidualBlock.forward)r   r¤   r¦   )r   )	r   r   r   r{   rª   rµ   r   r   r”   r!   r!   r‹   r"   r£   Á  s
   
r£   c                       sP   e Zd Zedœ‡ fdd„Zdd„ Zdd„ Zdeje	ej ejd	œd
d„Z
‡  ZS )ÚVitsHifiGanr–   c              
      sD  t ƒ  ¡  || _t|jƒ| _t|jƒ| _tj	|j
|jdddd| _t ¡ | _tt|j|jƒƒD ]H\}\}}| j tj|jd|  |jd|d   |||| d d¡ q^t ¡ | _tt| jƒƒD ]F}|jd|d   }t|j|jƒD ] \}}| j t||||jƒ¡ qäqÀtj	|dddddd| _|jdkr@t 	|j|jd¡| _d S )	Né   r   r   )rw   r¨   ry   rV   F)rw   r¨   ry   Úbiasr   )rz   r{   rp   r²   Zresblock_kernel_sizesÚnum_kernelsÚupsample_ratesÚnum_upsamplesr   r…   r˜   Zupsample_initial_channelr™   r}   Ú	upsamplerÚ	enumerater¶   Zupsample_kernel_sizesrˆ   ZConvTranspose1dÚ	resblocksr‡   Zresblock_dilation_sizesr£   r±   Ú	conv_postr„   Úcond)r‰   rp   rŠ   Zupsample_raterw   r­   rx   r‹   r!   r"   r{   ü  s<    
û

ûÿ

zVitsHifiGan.__init__c                 C   s0   | j D ]}tj |¡ q| jD ]}| ¡  qd S r3   )r¿   r   rƒ   rr   rÁ   rµ   r‘   r!   r!   r"   rµ      s    

zVitsHifiGan.apply_weight_normc                 C   s0   | j D ]}tj |¡ q| jD ]}| ¡  qd S r3   )r¿   r   rƒ   r   rÁ   r‘   r!   r!   r"   r   &  s    

zVitsHifiGan.remove_weight_normN)r   rŽ   Úreturnc                 C   sÀ   |   |¡}|dk	r ||  |¡ }t| jƒD ]p}tj || jj¡}| j	| |ƒ}| j
|| j  |ƒ}td| jƒD ] }|| j
|| j |  |ƒ7 }qn|| j }q*tj |¡}|  |¡}t |¡}|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        Nr   )r™   rÃ   r‡   r¾   r   rL   r·   rp   r±   r¿   rÁ   r¼   rÂ   r   r4   )r‰   r   rŽ   r   rŠ   Z	res_stateÚjr   r!   r!   r"   r   ,  s    


zVitsHifiGan.forward)N)r   r   r   r   r{   rµ   r   r   r   r   r   r”   r!   r!   r‹   r"   r¹   û  s   $ ÿ þr¹   c                       s,   e Zd Zedœ‡ fdd„Zddd„Z‡  ZS )	ÚVitsResidualCouplingLayerr–   c                    sR   t ƒ  ¡  |jd | _t | j|jd¡| _t||j	d| _
t |j| jd¡| _d S )NrV   r   r—   )rz   r{   r˜   Úhalf_channelsr   r…   r|   r™   ro   Z prior_encoder_num_wavenet_layersrš   rÂ   rœ   r‹   r!   r"   r{   P  s
    
z"VitsResidualCouplingLayer.__init__NFc                 C   sÊ   t j|| jgd dd\}}|  |¡| }|  |||¡}|  |¡| }t  |¡}	|s”||t  |	¡ |  }t j||gdd}
t  	|	ddg¡}|
|fS || t  |	 ¡ | }t j||gdd}
|
d fS d S )NrV   r   rR   )
r   r   rÇ   r™   rš   rÂ   rH   rK   Úcatr]   )r‰   r?   r   rŽ   rC   Ú
first_halfÚsecond_halfr   r    r¡   rN   Úlog_determinantr!   r!   r"   r   X  s    
z!VitsResidualCouplingLayer.forward)NFr¢   r!   r!   r‹   r"   rÆ   O  s   rÆ   c                       s,   e Zd Zedœ‡ fdd„Zddd„Z‡  ZS )	ÚVitsResidualCouplingBlockr–   c                    s8   t ƒ  ¡  t ¡ | _t|jƒD ]}| j t|ƒ¡ qd S r3   )	rz   r{   r   r}   Úflowsr‡   Zprior_encoder_num_flowsrˆ   rÆ   )r‰   rp   r°   r‹   r!   r"   r{   k  s    

z"VitsResidualCouplingBlock.__init__NFc                 C   sf   |s0| j D ]"}||||ƒ\}}t |dg¡}q
n2t| j ƒD ]&}t |dg¡}||||dd\}}q:|S )Nr   T©rC   )rÍ   r   ÚflipÚreversed)r‰   r?   r   rŽ   rC   Úflowr°   r!   r!   r"   r   q  s    
z!VitsResidualCouplingBlock.forward)NFr¢   r!   r!   r‹   r"   rÌ   j  s   rÌ   c                       s.   e Zd Zdedœ‡ fdd„Zd	dd„Z‡  ZS )
ÚVitsDilatedDepthSeparableConvr>   r–   c                    sÖ   t ƒ  ¡  |j}|j}|j| _t |¡| _t 	¡ | _
t 	¡ | _t 	¡ | _t 	¡ | _t| jƒD ]t}|| }|| | d }| j
 tj||||||d¡ | j t ||d¡¡ | j t |¡¡ | j t |¡¡ q\d S )NrV   )ru   rv   rw   Úgroupsrx   ry   r   )rz   r{   Úduration_predictor_kernel_sizer|   Zdepth_separable_num_layersrq   r   r€   r   r}   Úconvs_dilatedÚconvs_pointwiseÚnorms_1Únorms_2r‡   rˆ   r…   Ú	LayerNorm)r‰   rp   Údropout_raterw   r­   rŠ   rx   ry   r‹   r!   r"   r{   ~  s2    




úÿ
z&VitsDilatedDepthSeparableConv.__init__Nc                 C   s®   |d k	r|| }t | jƒD ]Š}| j| || ƒ}| j| | dd¡ƒ dd¡}tj |¡}| j| |ƒ}| j	| | dd¡ƒ dd¡}tj |¡}|  
|¡}|| }q|| S ©Nr   r=   )r‡   rq   rÕ   r×   Ú	transposer   rL   ZgelurÖ   rØ   r   )r‰   r?   r   rŽ   rŠ   r   r!   r!   r"   r   š  s    

z%VitsDilatedDepthSeparableConv.forward)r>   )Nr¢   r!   r!   r‹   r"   rÒ   }  s   rÒ   c                       s,   e Zd Zedœ‡ fdd„Zddd„Z‡  ZS )	ÚVitsConvFlowr–   c                    sr   t ƒ  ¡  |j| _|jd | _|j| _|j| _	t
 | j| jd¡| _t|ƒ| _t
 | j| j| jd d  d¡| _d S )NrV   r   r   )rz   r{   r|   Úfilter_channelsÚdepth_separable_channelsrÇ   Zduration_predictor_flow_binsre   Zduration_predictor_tail_boundrD   r   r…   r™   rÒ   Úconv_ddsr›   rœ   r‹   r!   r"   r{   ¬  s    

zVitsConvFlow.__init__NFc                 C   s   t j|| jgd dd\}}|  |¡}|  |||¡}|  |¡| }|j\}}	}
| ||	d|
¡ dddd¡}|dd | j	…f t
 | j¡ }|d| j	d| j	 …f t
 | j¡ }|dd| j	 d …f }t|||||| jd\}}t j||gdd| }|st  || ddg¡}||fS |d fS d S )	NrV   r   rR   r=   r   r   .)rC   rD   )r   r   rÇ   r™   rà   r›   rZ   ÚreshapeÚpermutere   Úmathrb   rÞ   rQ   rD   rÈ   r]   )r‰   r?   r   rŽ   rC   rÉ   rÊ   r   Ú
batch_sizer­   Úlengthr@   rA   rB   rO   rN   rË   r!   r!   r"   r   ·  s,    
$ú
	zVitsConvFlow.forward)NFr¢   r!   r!   r‹   r"   rÝ   «  s   rÝ   c                       s,   e Zd Zedœ‡ fdd„Zddd„Z‡  ZS )	ÚVitsElementwiseAffiner–   c                    sB   t ƒ  ¡  |j| _t t | jd¡¡| _t t | jd¡¡| _	d S ©Nr   )
rz   r{   rß   r­   r   Ú	Parameterr   ZzerosÚ	translateÚ	log_scalerœ   r‹   r!   r"   r{   ×  s    
zVitsElementwiseAffine.__init__NFc                 C   sh   |s@| j t | j¡|  }|| }t | j| ddg¡}||fS || j  t | j ¡ | }|d fS d S ©Nr   rV   )ré   r   rK   rê   r]   )r‰   r?   r   rŽ   rC   rN   rË   r!   r!   r"   r   Ý  s    zVitsElementwiseAffine.forward)NFr¢   r!   r!   r‹   r"   ræ   Ö  s   ræ   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )	ÚVitsStochasticDurationPredictorc                    s  t ƒ  ¡  |j}|j}t ||d¡| _t ||d¡| _t||j	d| _
|dkr^t ||d¡| _t ¡ | _| j t|ƒ¡ t|jƒD ]}| j t|ƒ¡ q‚t d|d¡| _t ||d¡| _t||j	d| _t ¡ | _| j t|ƒ¡ t|jƒD ]}| j t|ƒ¡ qìd S )Nr   )rÚ   r   )rz   r{   r„   r|   r   r…   r™   r›   rÒ   Úduration_predictor_dropoutrà   rÃ   r}   rÍ   rˆ   ræ   r‡   Zduration_predictor_num_flowsrÝ   Úpost_conv_preÚpost_conv_projÚpost_conv_ddsÚ
post_flows)r‰   rp   Ú	embed_dimrÞ   r°   r‹   r!   r"   r{   é  s2    
þ
þ
z(VitsStochasticDurationPredictor.__init__NFr*   c                 C   s²  t  |¡}|  |¡}|d k	r4t  |¡}||  |¡ }|  ||¡}|  |¡| }|s|  |¡}|  ||¡}|  |¡| }t  	| 
d¡d| 
d¡¡j|j|jd| }d}	|}
| jD ]0}||
||| d\}
}t  |
dg¡}
|	|7 }	q²t j|
ddgdd\}}|	t  tj |¡tj | ¡ | ddg¡7 }	t  dt dtj ¡|d   | ddg¡|	 }|t  |¡ | }t  t  |d¡¡| }t  | ddg¡}t j||gdd}| jD ].}||||d\}}t  |dg¡}||7 }q¬t  d	t dtj ¡|d   | ddg¡| }|| S tt| jƒƒ}|d d
… |d g }t  	| 
d¡d| 
d¡¡j|j|jd| }|D ](}t  |dg¡}||||dd\}}qht j|ddgdd\}}|S d S )Nr   rV   )Údevicer(   )rŽ   r   rR   ç      à¿gñhãˆµøä>g      à?éþÿÿÿr=   T)rŽ   rC   )r   Údetachr™   rÃ   rà   r›   rî   rð   rï   Úrandnr+   r-   ró   r(   rñ   rÏ   r   r]   r   rL   Z
logsigmoidrã   rJ   Úpir5   Ú	clamp_minrÈ   rÍ   ÚlistrÐ   )r‰   r?   r   rŽ   Z	durationsrC   Únoise_scaler   Zrandom_posteriorZlog_determinant_posterior_sumZlatents_posteriorrÑ   rË   rÉ   rÊ   ZlogqZlog_determinant_sumÚlatentsZnllrÍ   r°   Úlog_durationr!   r!   r"   r   	  sp    



&ÿÿ
  ÿ

 ÿ*ÿÿ
.ÿ&ÿÿz'VitsStochasticDurationPredictor.forward)NNFr*   ©r   r   r   r{   r   r”   r!   r!   r‹   r"   rì   è  s    rì   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚVitsDurationPredictorc                    s¬   t ƒ  ¡  |j}|j}t |j¡| _tj|j	|||d d| _
tj||jd| _tj||||d d| _tj||jd| _t |dd¡| _|jdkr¨t |j|j	d¡| _d S )NrV   )ry   ©Zepsr   r   )rz   r{   rÔ   Z"duration_predictor_filter_channelsr   r€   rí   r   r…   r|   Úconv_1rÙ   Úlayer_norm_epsÚnorm_1Úconv_2Únorm_2Úprojr„   rÃ   )r‰   rp   rw   rÞ   r‹   r!   r"   r{   O  s    

zVitsDurationPredictor.__init__Nc                 C   s¸   t  |¡}|d k	r*t  |¡}||  |¡ }|  || ¡}t  |¡}|  | dd¡¡ dd¡}|  |¡}|  || ¡}t  |¡}|  	| dd¡¡ dd¡}|  |¡}|  
|| ¡}|| S rÛ   )r   rö   rÃ   r  Zrelur  rÜ   r   r  r  r  )r‰   r?   r   rŽ   r!   r!   r"   r   ^  s    





zVitsDurationPredictor.forward)Nrþ   r!   r!   r‹   r"   rÿ   N  s   rÿ   c                
       s   e Zd ZdZedœ‡ fdd„Zejeedœdd„Z	deje
ej e
ej e
ej eeeje
ej f d
œdd„Zdd„ Zdd„ Zdd„ Z‡  ZS )ÚVitsAttentionz?Multi-headed attention with relative positional representation.r–   c                    s,  t ƒ  ¡  |j| _|j| _|j| _|j| _| j| j | _	| j	d | _
| j	| j | jkrptd| j› d| j› dƒ‚tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _| jr(t t d| jd d | j	¡| j
 ¡| _t t d| jd d | j	¡| j
 ¡| _d S )Nrô   zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r»   r   rV   )rz   r{   r|   rò   Znum_attention_headsÚ	num_headsZattention_dropoutr   Úwindow_sizeÚhead_dimÚscalingrY   r   ÚLinearZuse_biasÚk_projÚv_projÚq_projÚout_projrè   r   r÷   Ú	emb_rel_kÚ	emb_rel_vrœ   r‹   r!   r"   r{   v  s$    
ÿ(zVitsAttention.__init__)ÚtensorÚseq_lenr0   c                 C   s    |  ||| j| j¡ dd¡ ¡ S rë   )Úviewr  r
  rÜ   Ú
contiguous)r‰   r  r  r0   r!   r!   r"   Ú_shape  s    zVitsAttention._shapeNF)r   Úkey_value_statesÚattention_maskÚlayer_head_maskÚoutput_attentionsrÄ   c                 C   s  |  ¡ \}}}|  |¡| j }	|  |  |¡d|¡}
|  |  |¡d|¡}|| j d| jf}|  |	||¡j|Ž }	|
j|Ž }
|j|Ž }|
  d¡}t	 
|	|
 dd¡¡}|  ¡ || j ||fkrÚtd|| j ||f› d|  ¡ › ƒ‚| jdk	r|  | j|¡}t	 |	| dd¡¡}|  |¡}||7 }|dk	r„|  ¡ |d||fkrZtd|d||f› d|  ¡ › ƒ‚| || j||¡| }| || j ||¡}tjj|dd	}|dk	r|  ¡ | jfkrÌtd
| jf› d|  ¡ › ƒ‚| dddd¡| || j||¡ }| || j ||¡}|r0| || j||¡}| || j ||¡}nd}tjj|| j| jd}t	 
||¡}|  ¡ || j || jfkr–td|| j|| jf› d|  ¡ › ƒ‚| jdk	rÎ|  | j|¡}|  |¡}t	 ||¡}||7 }| || j|| j¡}| dd¡}| ||| j¡}|  |¡}||fS )z#Input shape: Batch x Time x Channelr=   r   rV   z$Attention weights should be of size z	, but is Nrõ   z!Attention mask should be of size rR   z/Head mask for a single layer should be of size )ÚpÚtrainingz `attn_output` should be of size )r+   r  r  r  r  r  r  r
  r  r   ZbmmrÜ   rY   r	  Ú_get_relative_embeddingsr  ÚmatmulÚ'_relative_position_to_absolute_positionr   rL   r[   r   r  r  Ú'_absolute_position_to_relative_positionrá   rò   r  )r‰   r   r  r  r  r  r0   r)   r°   Zquery_statesZ
key_statesZvalue_statesZ
proj_shaper1   Úattn_weightsZkey_relative_embeddingsZrelative_logitsZrel_pos_biasZattn_weights_reshapedZ
attn_probsZattn_outputZvalue_relative_embeddingsZrelative_weightsr!   r!   r"   r   ’  sl    


ÿ

ÿ
ÿ"ÿ

zVitsAttention.forwardc              	   C   sn   t || jd  dƒ}|dkr6tj |dd||ddg¡}t | jd | dƒ}|d|  d }|d d …||…f S )Nr   r   rV   )rX   r	  r   rL   r:   )r‰   Zrelative_embeddingsrå   Z
pad_lengthZslice_start_positionZslice_end_positionr!   r!   r"   r  ô  s    z&VitsAttention._get_relative_embeddingsc              	   C   s–   |  ¡ \}}}tj |ddddddg¡}| ||d | g¡}tj |d|d ddg¡}| ||d d| d g¡}|d d …d |…|d d …f }|S ©Nr   r   rV   ©r+   r   rL   r:   r  ©r‰   ÚxZbatch_headsrå   r°   Zx_flatZx_finalr!   r!   r"   r   ý  s    z5VitsAttention._relative_position_to_absolute_positionc              	   C   sŽ   |  ¡ \}}}tj |d|d ddddg¡}| ||d ||d   g¡}tj ||dddg¡}| ||d| g¡d d …d d …dd …f }|S r#  r$  r%  r!   r!   r"   r!    s    *z5VitsAttention._absolute_position_to_relative_position)NNNF)r   r   r   r   r   r{   r   ÚTensorr“   r  r   r.   r   r   r  r   r!  r”   r!   r!   r‹   r"   r  s  s$       úùb	r  c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚVitsFeedForwardc                    s¦   t ƒ  ¡  t |j|j|j¡| _t |j|j|j¡| _t 	|j
¡| _t|jtƒr^t|j | _n|j| _|jdkrœ|jd d }|jd }||ddddg| _nd | _d S )Nr   rV   r   )rz   r{   r   r…   r|   Zffn_dimZffn_kernel_sizer  r  r€   Zactivation_dropoutr   Ú
isinstanceZ
hidden_actÚstrr	   Úact_fnry   )r‰   rp   Úpad_leftÚ	pad_rightr‹   r!   r"   r{     s    


zVitsFeedForward.__init__c                 C   s¢   |  ddd¡}|  ddd¡}|| }| jd k	r>tj || j¡}|  |¡}|  |¡}|  |¡}|| }| jd k	r~tj || j¡}|  |¡}|| }|  ddd¡}|S )Nr   rV   r   )	râ   ry   r   rL   r:   r  r+  r   r  )r‰   r   r   r!   r!   r"   r   ,  s    





zVitsFeedForward.forwardrþ   r!   r!   r‹   r"   r(    s   r(  c                       sB   e Zd Zedœ‡ fdd„Zd	ejejeej e	dœdd„Z
‡  ZS )
ÚVitsEncoderLayerr–   c                    sX   t ƒ  ¡  t|ƒ| _t |j¡| _tj|j	|j
d| _t|ƒ| _tj|j	|j
d| _d S )Nr   )rz   r{   r  Ú	attentionr   r€   Zhidden_dropoutr   rÙ   r|   r  Ú
layer_normr(  Úfeed_forwardÚfinal_layer_normrœ   r‹   r!   r"   r{   D  s    


zVitsEncoderLayer.__init__NF)r   r   r  r  c                 C   sp   |}| j |||d\}}|  |¡}|  || ¡}|}|  ||¡}|  |¡}|  || ¡}|f}|rl||f7 }|S )N)r   r  r  )r/  r   r0  r1  r2  )r‰   r   r   r  r  r¸   r"  rN   r!   r!   r"   r   L  s     ý



zVitsEncoderLayer.forward)NF)r   r   r   r   r{   r   r'  r   r   r.   r   r”   r!   r!   r‹   r"   r.  C  s     ûûr.  c                
       s\   e Zd Zedœ‡ fdd„Zdejejeej ee	 ee	 ee	 e
eef dœdd„Z‡  ZS )	ÚVitsEncoderr–   c                    sB   t ƒ  ¡  ˆ | _t ‡ fdd„tˆ jƒD ƒ¡| _d| _ˆ j	| _	d S )Nc                    s   g | ]}t ˆ ƒ‘qS r!   )r.  r¯   r–   r!   r"   r®   n  s     z(VitsEncoder.__init__.<locals>.<listcomp>F)
rz   r{   rp   r   r}   r‡   Znum_hidden_layersÚlayersÚgradient_checkpointingÚ	layerdroprœ   r‹   r–   r"   r{   k  s
    
 zVitsEncoder.__init__N)r   r   r  r  Úoutput_hidden_statesÚreturn_dictrÄ   c                    s   |rdnd }ˆ rdnd }|d k	r,t ||jƒ}|| }tƒ }	| jD ]œ}
|rR||f }tj dd¡}| jon|| jk }|rx|	rÂ| j	rª| jrª‡ fdd„}t
jj ||
ƒ|||¡}n|
|||ˆ d}|d }|rÊd}ˆ r@||d f }q@|| }|rô||f }|stdd	„ |||fD ƒƒS t|||d
S )Nr!   r   r   c                    s   ‡ ‡fdd„}|S )Nc                     s   ˆ | ˆfžŽ S r3   r!   )r?   )Úmoduler  r!   r"   Úcustom_forward”  s    zJVitsEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr!   )r9  r:  ©r  )r9  r"   Úcreate_custom_forward“  s    z2VitsEncoder.forward.<locals>.create_custom_forward)r  r   r  )NNc                 s   s   | ]}|d k	r|V  qd S r3   r!   )r«   Úvr!   r!   r"   Ú	<genexpr>´  s      z&VitsEncoder.forward.<locals>.<genexpr>)r$   r   r   )r2   r(   r
   r4  rI   ÚrandomÚuniformr  r6  r5  r   rƒ   Ú
checkpointÚtupler   )r‰   r   r   r  r  r7  r8  Zall_hidden_statesZall_self_attentionsZdeepspeed_zero3_is_enabledZencoder_layerZdropout_probabilityZskip_the_layerr<  Zlayer_outputsr!   r;  r"   r   r  sR    	

üü
ýzVitsEncoder.forward)NNNN)r   r   r   r   r{   r   r   r   r'  r.   r   r   r   r   r”   r!   r!   r‹   r"   r3  j  s       ù
ør3  c                
       sv   e Zd ZdZedœ‡ fdd„Zdd„ Zdd„ Zdej	ej
eej	 ee ee ee eeej	 ef dœdd„Z‡  ZS )ÚVitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    r–   c                    sN   t ƒ  ¡  || _t |j|j|j¡| _t	|ƒ| _
tj|j|jd dd| _d S )NrV   r   )rw   )rz   r{   rp   r   Ú	EmbeddingZ
vocab_sizer|   Zpad_token_idÚembed_tokensr3  Úencoderr…   r˜   Úprojectrœ   r‹   r!   r"   r{   Â  s
    

zVitsTextEncoder.__init__c                 C   s   | j S r3   ©rE  ©r‰   r!   r!   r"   Úget_input_embeddingsÉ  s    z$VitsTextEncoder.get_input_embeddingsc                 C   s
   || _ d S r3   rH  )r‰   rU   r!   r!   r"   Úset_input_embeddingsÌ  s    z$VitsTextEncoder.set_input_embeddingsNT)Ú	input_idsr   r  r  r7  r8  rÄ   c                 C   sª   |   |¡t | jj¡ }| j||||||d}|s:|d n|j}	|  |	 dd¡¡ dd¡| }
t	j
|
| jjdd\}}|s”|	||f|dd …  }|S t|	|||j|jdS )N)r   r   r  r  r7  r8  r   r   rV   rR   )r$   r%   r&   r   r   )rE  rã   rb   rp   r|   rF  r$   rG  rÜ   r   r   r˜   r#   r   r   )r‰   rL  r   r  r  r7  r8  r   Zencoder_outputsr$   rŸ   r%   r&   rN   r!   r!   r"   r   Ï  s,    	ú	ûzVitsTextEncoder.forward)NNNT)r   r   r   r   r   r{   rJ  rK  r   r'  r   r   r.   r   r   r#   r   r”   r!   r!   r‹   r"   rC  ½  s"       ùørC  c                   @   s2   e Zd ZdZeZdZdZdZdd„ Z	ddd	„Z
d
S )ÚVitsPreTrainedModelz†
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    ZvitsrL  Tc                 C   sü   t |tjƒr:|jjjd| jjd |jdk	rø|jj 	¡  n¾t |tj
ƒrb|jj 	¡  |jj d¡ n–t |tjƒrºtj |j¡ |jdk	røt |j|j|jd   ¡}tjj|j| |d n>t |tjƒrø|jjjd| jjd |jdk	rø|jj|j  	¡  dS )zInitialize the weightsr>   )r    ZstdNr*   r   )rk   rl   )r)  r   r  rs   ÚdataZnormal_rp   Zinitializer_ranger»   Zzero_rÙ   Zfill_r…   ÚinitZkaiming_normal_rã   rb   rÓ   ru   rw   Zuniform_rD  Zpadding_idx)r‰   r9  Úkr!   r!   r"   Ú_init_weights   s     


z!VitsPreTrainedModel._init_weightsFc                 C   s   t |tƒr||_d S r3   )r)  rC  r5  )r‰   r9  rU   r!   r!   r"   Ú_set_gradient_checkpointing  s    
z/VitsPreTrainedModel._set_gradient_checkpointingN)F)r   r   r   r   r   Úconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingrQ  rR  r!   r!   r!   r"   rM  õ  s   rM  aI  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`VitsConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aÎ  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z6The complete VITS model, for text-to-speech synthesis.c                       sŠ   e Zd Zedœ‡ fdd„Zdd„ Zeeƒee	e
ddeej eej ee ee ee ee eej eee e	f dœd	d
„ƒƒZ‡  ZS )Ú	VitsModelr–   c                    s–   t ƒ  |¡ || _t|ƒ| _t|ƒ| _t|ƒ| _|j	rBt
|ƒ| _n
t|ƒ| _|jdkrht |j|j¡| _t|ƒ| _|j| _|j| _|j| _|  ¡  d S rç   )rz   r{   rp   rC  Útext_encoderrÌ   rÑ   r¹   ÚdecoderÚ"use_stochastic_duration_predictionrì   Úduration_predictorrÿ   Únum_speakersr   rD  r„   Úembed_speakerr•   Zposterior_encoderÚspeaking_raterû   Únoise_scale_durationZ	post_initrœ   r‹   r!   r"   r{   M  s    





zVitsModel.__init__c                 C   s   | j S r3   )rU  rI  r!   r!   r"   Úget_encoderg  s    zVitsModel.get_encoder)Úoutput_typerS  N)rL  r  Ú
speaker_idr  r7  r8  ÚlabelsrÄ   c           #   
   C   sŠ  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}|dk	rT| d¡ ¡ }nt |¡ d¡ ¡ }| j jdkrÜ|dk	rÜd|  kr”| j jk s®n t	d| j jd › dƒ‚t
|tƒrÊtjd|| jd}|  |¡ d¡}	nd}	|dk	rðtd	ƒ‚| j||||||d
}
|s|
d n|
j}| dd¡}| dd¡}|s@|
d n|
j}|sT|
d n|
j}| j jr|| j|||	d| jd}n|  |||	¡}d| j }t t |¡| | ¡}t t |ddg¡d¡ ¡ }tj| ¡ |j |jd}| d¡| d¡k }| d¡ !|j ¡}t |d¡t |d¡ }|j"\}}}}t #|d¡ $|| d¡}tj||j |jd}| d¡|k }| !|j ¡ $|||¡}|t%j& '|ddddddg¡dd…dd…f  }| d¡ dd¡| }t (| )d¡|¡ dd¡}t (| )d¡|¡ dd¡}|t *|¡t |¡ | j+  }| j,|||	dd}|| }|  -||	¡} |  )d¡} |t. /| j j0¡ }!|st| |!|f|
dd…  }"|"S t1| |!||
j2|
j3dS )aZ  
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Returns:

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nr=   r   r   z Set `speaker_id` in the range 0-Ú.)r   )r+   Z
fill_valueró   z&Training of VITS is not supported yet.)rL  r   r  r  r7  r8  rV   T)rC   rû   r*   )r(   ró   r   rÎ   )r   r   r   r   r   )4rp   r  r7  Zuse_return_dictZ	unsqueezeÚfloatr   Z	ones_likerY  rY   r)  r“   Úfullró   rZ  ÚNotImplementedErrorrU  r$   rÜ   r%   r&   rW  rX  r\  r[  ÚceilrK   rù   r]   ÚlongZarangerX   r(   r-   rZ   r\   r  r   rL   r:   r  Zsqueezerž   rû   rÑ   rV  rI   Úprodr½   r   r   r   )#r‰   rL  r  r_  r  r7  r8  r`  Zinput_padding_maskZspeaker_embeddingsZtext_encoder_outputr   r%   r&   rý   Zlength_scaleÚdurationZpredicted_lengthsÚindicesZoutput_padding_maskZ	attn_maskrä   r°   Zoutput_lengthZinput_lengthZcum_durationZvalid_indicesZpadded_indicesZattnZprior_latentsrü   r   r   r   rN   r!   r!   r"   r   j  sŒ    &ÿ
ú
û
.
ûzVitsModel.forward)NNNNNNN)r   r   r   r   r{   r]  r   ÚVITS_INPUTS_DOCSTRINGr   r   Ú_CONFIG_FOR_DOCr   r   r'  r“   r.   r   r   r   r   r   r”   r!   r!   r‹   r"   rT  H  s*   
       ø÷rT  )N)Fr8   r9   r9   r9   )Cr   rã   Údataclassesr   Útypingr   r   r   r   ÚnumpyrI   r   Ztorch.utils.checkpointr   Zactivationsr	   Zintegrations.deepspeedr
   Zmodeling_outputsr   r   Zmodeling_utilsr   rƒ   r   r   r   r   Zconfiguration_vitsr   Z
get_loggerr   Úloggerrk  Z"VITS_PRETRAINED_MODEL_ARCHIVE_LISTr   r#   r'  r(   r“   r2   ZjitÚscriptr7   rQ   rM   ÚModulero   r•   r£   r¹   rÆ   rÌ   rÒ   rÝ   ræ   rì   rÿ   r  r(  r.  r3  rC  rM  ZVITS_START_DOCSTRINGrj  rT  r!   r!   r!   r"   Ú<module>   sr   
ÿ  
     ÷
J 	P:T.+f% '*'S8#þ