U
    9%e                     @   s$   d Z ddlmZ G dd deZdS )z$Speech processor class for SpeechT5.   )ProcessorMixinc                       sH   e Zd ZdZdZdZ fddZdd Zdd	 Zd
d Z	dd Z
  ZS )SpeechT5Processora}  
    Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor.

    [`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See
    the docstring of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information.

    Args:
        feature_extractor (`SpeechT5FeatureExtractor`):
            An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`SpeechT5Tokenizer`):
            An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input.
    ZSpeechT5FeatureExtractorZSpeechT5Tokenizerc                    s   t  || d S )N)super__init__)selffeature_extractor	tokenizer	__class__ o/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/speecht5/processing_speecht5.pyr   $   s    zSpeechT5Processor.__init__c                 O   sb  | dd}| dd}| dd}| dd}| dd}|dk	rT|dk	rTtd|dk	rl|dk	rltd|dkr|dkr|dkr|dkrtd	|dk	r| j|f|d|i|}n|dk	r| j|f|}nd}|dk	r| j|||d
|}	|	d }
n&|dk	r| j|f|}	|	d }
nd}	|dkr0|	S |	dk	r^|
|d< |	d}|dk	r^||d< |S )a  
        Processes audio and text input, as well as audio and text targets.

        You can process audio by using the argument `audio`, or process audio targets by using the argument
        `audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's
        [`~SpeechT5FeatureExtractor.__call__`].

        You can process text by using the argument `text`, or process text labels by using the argument `text_target`.
        This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`].

        Valid input combinations are:

        - `text` only
        - `audio` only
        - `text_target` only
        - `audio_target` only
        - `text` and `audio_target`
        - `audio` and `audio_target`
        - `text` and `text_target`
        - `audio` and `text_target`

        Please refer to the docstring of the above two methods for more information.
        audioNtexttext_targetaudio_targetsampling_ratez\Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?z\Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?zaYou need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process.)r   r   input_values	input_idslabelsattention_maskdecoder_attention_mask)pop
ValueErrorr   r   get)r   argskwargsr   r   r   r   r   inputstargetsr   r   r   r   r   __call__'   sJ     






zSpeechT5Processor.__call__c           
      O   sX  | dd}| dd}| dd}|dk	r<|dk	r<td|dkr\|dkr\|dkr\td|dk	rz| jj|f||}n|dk	r| jj|f|}nd}|dk	rd|kst|trd|d kr| jj|f|}|d }n8| jj}| jj| j_| jj|f||}|| j_|d }nd}|dkr&|S |dk	rT||d< |	d}	|	dk	rT|	|d	< |S )
au  
        Collates the audio and text inputs, as well as their targets, into a padded batch.

        Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
        by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].

        Valid input combinations are:

        - `input_ids` only
        - `input_values` only
        - `labels` only, either log-mel spectrograms or text tokens
        - `input_ids` and log-mel spectrogram `labels`
        - `input_values` and text `labels`

        Please refer to the docstring of the above two methods for more information.
        r   Nr   r   z:Cannot process both `input_values` and `input_ids` inputs.zZYou need to specify either an `input_values`, `input_ids`, or `labels` input to be padded.    r   r   )
r   r   r   padr   
isinstancelistZfeature_sizeZnum_mel_binsr   )
r   r   r   r   r   r   r   r   Zfeature_size_hackr   r   r   r   r    n   s@    






zSpeechT5Processor.padc                 O   s   | j j||S )z
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r   batch_decoder   r   r   r   r   r   r#      s    zSpeechT5Processor.batch_decodec                 O   s   | j j||S )z
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoder$   r   r   r   r%      s    zSpeechT5Processor.decode)__name__
__module____qualname____doc__Zfeature_extractor_classZtokenizer_classr   r   r    r#   r%   __classcell__r   r   r	   r   r      s   G<r   N)r)   Zprocessing_utilsr   r   r   r   r   r   <module>   s   