U
    9%eǂ                     @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZmZmZmZmZ d	d
lmZ eeZdZddgZeG dd deZeG dd deZeG dd deZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G d d! d!ejZ%G d"d# d#ejZ&G d$d% d%ejZ'G d&d' d'eZ(d(Z)d)Z*ed*e)G d+d, d,e(Z+dS )-z PyTorch EnCodec model.    N)	dataclass)ListOptionalTupleUnion)nn   )PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )EncodecConfigr   zfacebook/encodec_24khzzfacebook/encodec_48khzc                   @   s.   e Zd ZU dZdZejed< dZejed< dS )EncodecOutputai  
    Args:
        audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_values (`torch.FlaotTensor` of shape `(batch_size, sequence_length)`, *optional*)
            Decoded audio values, obtained using the decoder part of Encodec.
    Naudio_codesaudio_values)	__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r    r   r   k/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/encodec/modeling_encodec.pyr   2   s   
r   c                   @   s.   e Zd ZU dZdZejed< dZejed< dS )EncodecEncoderOutputa  
    Args:
        audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    Nr   audio_scales)	r   r   r   r   r   r   r   r   r   r   r   r   r   r   @   s   
r   c                   @   s    e Zd ZU dZdZejed< dS )EncodecDecoderOutputz
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    Nr   )r   r   r   r   r   r   r   r   r   r   r   r   r   N   s   
r   c                       sz   e Zd ZdZdeeeeed fddZedejeeeeddd	Z	edeje
eef eedddZdd Z  ZS )EncodecConv1dz;Conv1d with asymmetric or causal padding and normalization.r   )in_channelsout_channelskernel_sizestridedilationc              	      s   t    |j| _|j| _|j| _| jdkr<td| j |dkrj|dkrjtd| d| d| d t	j
|||||d| _| jd	krt	j| j| _n| jd
krt	d|| _d S )Nweight_normtime_group_normIself.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got r   zQEncodecConv1d has been initialized with stride > 1 and dilation > 1 (kernel_size=z stride=z, dilation=z).r%   r'   r(   )super__init__use_causal_convcausalpad_mode	norm_type
ValueErrorloggerwarningr   Conv1dconvutilsr'   	GroupNormnorm)selfconfigr!   r"   r#   r$   r%   	__class__r   r   r,   \   s"    




zEncodecConv1d.__init__r   )hidden_statesr#   r$   padding_totalreturnc                 C   s@   | j d }|| | | d }t|d | ||  }|| S )zSee `pad_for_conv1d`.r   )shapemathceil)r=   r#   r$   r>   lengthZn_framesZideal_lengthr   r   r   _get_extra_padding_for_conv1dv   s    
z+EncodecConv1d._get_extra_padding_for_conv1dzero        )r=   paddingsmodevaluec                 C   s   | j d }|\}}|dks,tj| |||S t||}d}||kr`|| d }tj| d|f} tj| |||}	|	j d | }
|	dd|
f S )zTiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
        If this is the case, we insert extra 0 padding to the right before the reflection happens.
        r@   Zreflectr   r   .N)rA   r   
functionalpadmax)r=   rH   rI   rJ   rD   padding_leftpadding_rightZmax_padZ	extra_padpaddedendr   r   r   _pad1d   s    

zEncodecConv1d._pad1dc           	      C   s   | j jd }| j jd }| j jd }|d | d }|| }| ||||}| jrj| j|||f| jd}n*|d }|| }| j|||| f| jd}|  |}| jdkr| 	|}|S )Nr   r   )rI      r(   )
r5   r#   r$   r%   rE   r.   rR   r/   r0   r8   )	r9   r=   r#   r$   r%   r>   Zextra_paddingrO   rN   r   r   r   forward   s&     
 


zEncodecConv1d.forward)r   r   )r   )rF   rG   )r   r   r   r   intr,   staticmethodr   TensorrE   r   strfloatrR   rT   __classcell__r   r   r;   r   r    Y   s,              	"r    c                       s6   e Zd ZdZdeeeed fddZdd Z  ZS )	EncodecConvTranspose1dzDConvTranspose1d with asymmetric or causal padding and normalization.r   )r!   r"   r#   r$   c                    s   t    |j| _|j| _|j| _| jdkr<td| j t||||| _	|jdkrjtj
| j	| _	n|jdkrtd|| _| js| jdkstdd S )Nr&   r)   r'   r(   r         ?zB`trim_right_ratio` != 1.0 only makes sense for causal convolutions)r+   r,   r-   r.   trim_right_ratior0   r1   r   ZConvTranspose1dr5   r6   r'   r7   r8   )r9   r:   r!   r"   r#   r$   r;   r   r   r,      s    




zEncodecConvTranspose1d.__init__c                 C   s   | j jd }| j jd }|| }|  |}| jdkr>| |}| jrVt|| j }n|d }|| }|j	d | }|d||f }|S )Nr   r(   rS   r@   .)
r5   r#   r$   r0   r8   r.   rB   rC   r]   rA   )r9   r=   r#   r$   r>   rO   rN   rQ   r   r   r   rT      s    


zEncodecConvTranspose1d.forward)r   )r   r   r   r   rU   r,   rT   rZ   r   r   r;   r   r[      s   r[   c                       s(   e Zd ZdZ fddZdd Z  ZS )EncodecLSTMzz
    LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
    c                    s    t    t|||j| _d S N)r+   r,   r   LSTMZnum_lstm_layerslstm)r9   r:   	dimensionr;   r   r   r,      s    
zEncodecLSTM.__init__c                 C   s2   | ddd}| |d | }| ddd}|S )NrS   r   r   )permutera   )r9   r=   r   r   r   rT      s    zEncodecLSTM.forward)r   r   r   r   r,   rT   rZ   r   r   r;   r   r^      s   r^   c                       s6   e Zd ZdZeeee d fddZdd Z  Z	S )EncodecResnetBlockz>
    Residual block from SEANet model as used by EnCodec.
    )r:   dim	dilationsc              	      s   t    |jdf}t|t|kr,td||j }g }tt||D ]Z\}\}}	|dkr`|n|}
|t|d krx|n|}|t	 g7 }|t
||
|||	dg7 }qHt|| _|jrt
|||dd| _n
t | _d S )Nr   z7Number of kernel sizes should match number of dilationsr   r*   )r#   )r+   r,   Zresidual_kernel_sizelenr1   compress	enumeratezipr   ELUr    
ModuleListblockZuse_conv_shortcutshortcutZIdentity)r9   r:   re   rf   Zkernel_sizesZhiddenrm   ir#   r%   Zin_chsZout_chsr;   r   r   r,      s    


zEncodecResnetBlock.__init__c                 C   s&   |}| j D ]}||}q
| || S r_   )rm   rn   )r9   r=   residuallayerr   r   r   rT     s    

zEncodecResnetBlock.forward)
r   r   r   r   r   rU   r   r,   rT   rZ   r   r   r;   r   rd      s   rd   c                       s.   e Zd ZdZed fddZdd Z  ZS )EncodecEncoderz"SEANet encoder as used by EnCodec.r:   c              	      s   t    t||j|j|jg}d}t|jD ]p}||j }t|j	D ] }|t
|||j| dgg7 }qF|t g7 }|t|||d |d |dg7 }|d9 }q.|t|||j g7 }|t g7 }|t|||j |j|jg7 }t|| _d S )Nr   rS   r#   r$   )r+   r,   r    audio_channelsnum_filtersr#   reversedupsampling_ratiosrangenum_residual_layersrd   dilation_growth_rater   rk   r^   hidden_sizelast_kernel_sizerl   layers)r9   r:   modelscalingratiocurrent_scalejr;   r   r   r,     s    

 
zEncodecEncoder.__init__c                 C   s   | j D ]}||}q|S r_   r~   r9   r=   rq   r   r   r   rT   ,  s    

zEncodecEncoder.forwardr   r   r   r   r   r,   rT   rZ   r   r   r;   r   rr     s   rr   c                       s.   e Zd ZdZed fddZdd Z  ZS )EncodecDecoderz"SEANet decoder as used by EnCodec.rs   c              	      s   t    tdt|j }t||j||j |jg}|t	|||j g7 }|jD ]t}||j }|t
 g7 }|t|||d |d |dg7 }t|jD ]$}|t||d |j| dfg7 }q|d }qR|t
 g7 }|t||j|j|jg7 }t
|| _d S )NrS   rt   r   )r+   r,   rU   rg   rx   r    r|   rv   r#   r^   r   rk   r[   ry   rz   rd   r{   ru   r}   rl   r~   )r9   r:   r   r   r   r   r   r;   r   r   r,   5  s     


"
zEncodecDecoder.__init__c                 C   s   | j D ]}||}q|S r_   r   r   r   r   r   rT   N  s    

zEncodecDecoder.forwardr   r   r   r;   r   r   2  s   r   c                       s>   e Zd ZdZed fddZdd Zdd Zd	d
 Z  Z	S )EncodecEuclideanCodebookz!Codebook with Euclidean distance.rs   c                    sj   t    t|j|j}|j| _| dtdg | dt|j | d| | d|  d S )NinitedTZcluster_sizeembedZ	embed_avg)	r+   r,   r   zeroscodebook_sizeZcodebook_dimZregister_bufferrW   clone)r9   r:   r   r;   r   r   r,   W  s    
z!EncodecEuclideanCodebook.__init__c                 C   sV   | j  }|djddd}|d| |  |djddd  }|jddj}|S )NrS   r   Tkeepdimr   r@   )re   )r   tpowsumrM   indices)r9   r=   r   Zscaled_statesdist	embed_indr   r   r   quantizeb  s
    
&z!EncodecEuclideanCodebook.quantizec                 C   s8   |j }|d|d f}| |}|j|d d  }|S )Nr@   )rA   Zreshaper   view)r9   r=   rA   r   r   r   r   encodei  s
    
zEncodecEuclideanCodebook.encodec                 C   s   t j|| j}|S r_   )r   rK   Z	embeddingr   r9   r   r   r   r   r   decodes  s    zEncodecEuclideanCodebook.decode)
r   r   r   r   r   r,   r   r   r   rZ   r   r   r;   r   r   T  s
   
r   c                       s6   e Zd ZdZed fddZdd Zdd Z  ZS )	EncodecVectorQuantizationzY
    Vector quantization implementation. Currently supports only euclidean distance.
    rs   c                    s   t    t|| _d S r_   )r+   r,   r   codebookr9   r:   r;   r   r   r,   }  s    
z"EncodecVectorQuantization.__init__c                 C   s   | ddd}| j|}|S Nr   rS   r   )rc   r   r   )r9   r=   Zembed_inr   r   r   r     s    z EncodecVectorQuantization.encodec                 C   s   | j |}|ddd}|S r   )r   r   rc   r   r   r   r   r     s    z EncodecVectorQuantization.decode)	r   r   r   r   r   r,   r   r   rZ   r   r   r;   r   r   x  s   r   c                       sl   e Zd ZdZed fddZdee edddZ	de
jee e
jd	d
dZe
je
jdddZ  ZS )EncodecResidualVectorQuantizerzResidual Vector Quantizer.rs   c                    sF   t     j| _ j| _ j| _t fddt jD | _d S )Nc                    s   g | ]}t  qS r   )r   ).0_rs   r   r   
<listcomp>  s     z;EncodecResidualVectorQuantizer.__init__.<locals>.<listcomp>)	r+   r,   r   
frame_ratenum_quantizersr   rl   ry   r~   r   r;   rs   r   r,     s
    
z'EncodecResidualVectorQuantizer.__init__N)	bandwidthr?   c                 C   sH   t | j| j }| j}|dk	rD|dkrDttdt |d | }|S )z:Return num_quantizers based on specified target bandwidth.NrG   r   i  )rB   log2r   r   r   rU   rM   floor)r9   r   Zbw_per_qr   r   r   r    get_num_quantizers_for_bandwidth  s
    z?EncodecResidualVectorQuantizer.get_num_quantizers_for_bandwidth)
embeddingsr   r?   c           
      C   sZ   |  |}|}g }| jd| D ]*}||}||}|| }|| q t|}	|	S )z
        Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
        the appropriate number of quantizers to use and returns indices for each quantizer.
        N)r   r~   r   r   appendr   stack)
r9   r   r   r   rp   Zall_indicesrq   r   	quantizedZout_indicesr   r   r   r     s    



z%EncodecResidualVectorQuantizer.encode)codesr?   c                 C   sB   t jd|jd}t|D ]$\}}| j| }||}|| }q|S )z7Decode the given codes to the quantized representation.rG   )device)r   Ztensorr   ri   r~   r   )r9   r   Zquantized_outro   r   rq   r   r   r   r   r     s    


z%EncodecResidualVectorQuantizer.decode)N)N)r   r   r   r   r   r,   r   rY   rU   r   r   rW   r   r   rZ   r   r   r;   r   r     s
   r   c                   @   s2   e Zd ZdZeZdZdZdZdd Z	ddd	Z
d
S )EncodecPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    Zencodecinput_valuesTc                 C   s\  t |tjr<|jjjd| jjd |jdk	r8|jj	  nt |tj
tjfrj|jj	  |jjd nt |tjrtj|j |jdk	rt|j|j|jd   }tjj|j| |d nt |tjr|jjjd| jjd |jdk	rX|jj|j 	  nRt |tjrX| D ]:\}}d|kr<tj| nd|krtj|d qdS )	zInitialize the weightsrG   )meanZstdNr\   r   )abweightbias)
isinstancer   ZLinearr   dataZnormal_r:   Zinitializer_ranger   Zzero_Z	LayerNormr7   Zfill_r4   initZkaiming_normal_rB   sqrtgroupsr!   r#   Zuniform_Z	EmbeddingZpadding_idxr`   Znamed_parametersZxavier_uniform_Z	constant_)r9   moduleknameparamr   r   r   _init_weights  s,    



z$EncodecPreTrainedModel._init_weightsFc                 C   s   t |ttfr||_d S r_   )r   rr   r   Zgradient_checkpointing)r9   r   rJ   r   r   r   _set_gradient_checkpointing  s    z2EncodecPreTrainedModel._set_gradient_checkpointingN)F)r   r   r   r   r   config_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingr   r   r   r   r   r   r     s   r   aL  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`EncodecConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a)  
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
            Raw audio input converted to Float and padded to the approriate length in order to be encoded using chunks
            of length self.chunk_length and a stride of `config.chunk_stride`.
        padding_mask (`torch.BoolTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
            Mask to avoid computing scaling factors on padding token indices (can we avoid computing conv on these+).
            Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            <Tip warning={true}>

             `padding_mask` should always be passed, unless the input was truncated or not padded. This is because in
             order to process tensors effectively, the input audio should be padded so that `input_length % stride =
             step` with `step = chunk_length-stride`. This ensures that all chunks are of the same shape

            </Tip>

        bandwidth (`float`, *optional*):
            The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
            bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
            `bandwidth == 6.0`
        audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z%The EnCodec neural audio codec model.c                       sf  e Zd Zed fddZdd Zdd Zeje	e
eejeej f dd	d
Zdejejee	 ee eeejeej f ef dddZeeej e
dddZdejeej ejdddZdejejeej ee eeejejf ef dddZeeeeeddejeej ee	 eej eej ee eeejejf ef dddZ  ZS ) EncodecModelrs   c                    sj   t  | || _t|| _t|| _t|| _t	t
| jj| _d| j | jjkr^td|   d S )NrS   z'The codebook_size must be a power of 2.)r+   r,   r:   rr   encoderr   decoderr   	quantizerrU   rB   r   r   Zbits_per_codebookr1   Z	post_initr   r;   r   r   r,     s    


zEncodecModel.__init__c                 C   s   | j S r_   )r   r9   r   r   r   get_encoder(  s    zEncodecModel.get_encoderc                 C   s   | j S r_   )r   r   r   r   r   get_decoder+  s    zEncodecModel.get_decoder)r   r   padding_maskr?   c           
      C   s   |j d }|| jj }| jjdk	rJ|d| jj krJtd| d| jj d}| jjr|| }tj|ddd|j d  }|d	j	ddd

 d }|| }| |}| j||}	|	dd}	|	|fS )z
        Encodes the given input using the underlying VQVAE. If `config.normalize` is set to `True` the input is first
        normalized. The padding mask is required to compute the correct scale.
        r@   Ngh㈵>zDuration of frame (z) is longer than chunk r   Tr   rS   )re   r   g:0yE>r   )rA   r:   Zsampling_rateZchunk_length_sRuntimeError	normalizer   r   r   r   r   r   r   r   	transpose)
r9   r   r   r   rD   durationscalemonor   r   r   r   r   _encode_frame.  s    

zEncodecModel._encode_frameN)r   r   r   return_dictr?   c                 C   sl  |dk	r|n| j j}|dkr(| j jd }|| j jkrNtd| d| j j d|j\}}}|dk sj|dkrxtd| | j j}|dkr|}|}	n| j j}	|dkrt|	 }g }
g }||	 }||	 | dkrtd	t
d|| |	D ]`}|d
||| f 	 }|dddd||| f }| |||\}}|
| || qt|
}
|sb|
|fS t|
|S )a  
        Encodes the input audio waveform into discrete codes.

        Args:
            input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Float values of the input audio waveform.
            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            bandwidth (`float`, *optional*):
                The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
                bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented
                as bandwidth == 6.0

        Returns:
            A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
            factors for each chunk when `normalize` is True. Each frames is a tuple `(codebook, scale)`, with
            `codebook` of shape `[batch_size, num_codebooks, frames]`.
        Nr   z)This model doesn't support the bandwidth z. Select one of .r   rS   z1Number of audio channels must be 1 or 2, but got zkThe input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly..)r:   r   Ztarget_bandwidthsr1   rA   chunk_lengthchunk_strider   	ones_likeboolry   r   r   r   r   )r9   r   r   r   r   r   ZchannelsZinput_lengthr   r$   Zencoded_framesscalesstepoffsetmaskframeZencoded_framer   r   r   r   r   H  sD    

zEncodecModel.encode)framesr$   c                 C   sL  t | dkrtd| d j}| d j}| d jd d }|t | d  | d jd  }| d jd }tjdd|d ||ddd }d|d   }tj|||d}	tj||f||d}
d}| D ]^}|jd }|
d||| f  |d | | 7  < |	|||   |d | 7  < ||7 }q|		 dkrDtd	|	 d
|
|	 S )Nr   z!`frames` cannot be an empty list.r@   r   rS   )r   dtypeg      ?.z7`sum_weight` minimum element must be bigger than zero: `)
rg   r1   r   r   rA   r   Zlinspaceabsr   min)r   r$   r   r   rA   
total_sizeZframe_lengthZtime_vecr   Z
sum_weightoutr   r   r   r   r   _linear_overlap_add  s(    

 
( 
z EncodecModel._linear_overlap_add)r   r   r?   c                 C   s@   | dd}| j|}| |}|d k	r<||ddd }|S )Nr   r   r@   )r   r   r   r   r   )r9   r   r   r   outputsr   r   r   _decode_frame  s    
zEncodecModel._decode_frame)r   r   r   r   r?   c                 C   s   |p
| j j}| j j}|dkrPt|dkr:tdt| | |d |d }nBg }t||D ]\}}	| ||	}
||
 q^| || j j	pd}|dk	r|j
d |j
d k r|dd|j
d f }|s|fS t|S )aN  
        Decodes the given frames into an output audio waveform.

        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
        trimmed.

        Args:
            audio_codes (`torch.FloatTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
                Discret code embeddings computed using `model.encode`.
            audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
                Scaling factor for each `audio_codes` input.
            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Nr   zExpected one frame, got r   r@   .)r:   r   r   rg   r1   r   rj   r   r   r   rA   r   )r9   r   r   r   r   r   r   Zdecoded_framesr   r   r   r   r   r   r     s     zEncodecModel.decode)output_typer   )r   r   r   r   r   r   r?   c                 C   s   |p
| j j}|dkr"t| }|dk	r:|dkr:td|dk	rR|dkrRtd|dkrv|dkrv| |||d\}}| j||||dd }|s||fS t||dS )a  
        Returns:

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, EncodecModel

        >>> dataset = load_dataset("ashraq/esc50")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model_id = "facebook/encodec_24khz"
        >>> model = EncodecModel.from_pretrained(model_id)
        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> inputs = processor(raw_audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```NzBYou specified `audio_codes` but did not specify the `audio_scales`zBYou specified `audio_scales` but did not specify the `audio_codes`F)r   r   )r   r   )	r:   r   r   r   r   r1   r   r   r   )r9   r   r   r   r   r   r   r   r   r   r   rT     s    !zEncodecModel.forward)NNN)N)NN)NNNNN)r   r   r   r   r,   r   r   r   rW   rY   rU   r   r   r   r   r   r   r   rV   r   r   r   r   r   r   ENCODEC_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCrT   rZ   r   r   r;   r   r     s\        I.  0
     r   ),r   rB   dataclassesr   typingr   r   r   r   r   Ztorch.utils.checkpointr   Zmodeling_utilsr	   r6   r
   r   r   r   r   Zconfiguration_encodecr   Z
get_loggerr   r2   r   Z%ENCODEC_PRETRAINED_MODEL_ARCHIVE_LISTr   r   r   Moduler    r[   r^   rd   rr   r   r   r   r   r   ZENCODEC_START_DOCSTRINGr   r   r   r   r   r   <module>   sJ   

U4!"$,)!