"""Tokenization class for SpeechT5."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from .number_normalizer import EnglishNumberNormalizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spm_char.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/speecht5_asr": "https://huggingface.co/microsoft/speecht5_asr/resolve/main/spm_char.model",
        "microsoft/speecht5_tts": "https://huggingface.co/microsoft/speecht5_tts/resolve/main/spm_char.model",
        "microsoft/speecht5_vc": "https://huggingface.co/microsoft/speecht5_vc/resolve/main/spm_char.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/speecht5_asr": 1024,
    "microsoft/speecht5_tts": 1024,
    "microsoft/speecht5_vc": 1024,
}


class SpeechT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The begin of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelled-out English counterparts.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
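
    Example (a minimal usage sketch, assuming the public `microsoft/speecht5_asr` checkpoint,
    whose `spm_char.model` vocabulary file is listed in `PRETRAINED_VOCAB_FILES_MAP` above):

    ```python
    >>> from transformers import SpeechT5Tokenizer

    >>> tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_asr")
    >>> inputs = tokenizer("the quick brown fox")
    >>> ids = inputs["input_ids"]  # character-level ids, ending in eos_token_id
    >>> text = tokenizer.decode(ids)
    ```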
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        normalize=False,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.normalize = normalize
        self._normalizer = None

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            normalize=normalize,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        normalize = kwargs.pop("normalize", self.normalize)
        if is_split_into_words:
            text = " " + text
        if normalize:
            text = self.normalizer(text)
        return (text, kwargs)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    @property
    def normalizer(self):
        # Lazily instantiate the English number normalizer on first use.
        if self._normalizer is None:
            self._normalizer = EnglishNumberNormalizer()
        return self._normalizer

    @normalizer.setter
    def normalizer(self, value):
        self._normalizer = value

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        # The SentencePiece processor is not picklable; drop it here and reload it in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency.
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Only the appended </s> counts as a special token, hence the single trailing 1.
        suffix_ones = [1]
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + suffix_ones
        return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original vocab file is unavailable; dump the serialized SentencePiece proto instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)
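

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (not part of the upstream module). It assumes a
# character-level SentencePiece model saved locally as "spm_char.model", e.g.
# the file served at one of the PRETRAINED_VOCAB_FILES_MAP URLs above, and
# exercises the encode/decode round trip and the special-token helpers.
if __name__ == "__main__":
    tokenizer = SpeechT5Tokenizer("spm_char.model")

    ids = tokenizer("hello world")["input_ids"]  # ends with eos_token_id
    tokens = tokenizer.convert_ids_to_tokens(ids)
    print(tokens)
    print(tokenizer.convert_tokens_to_string(tokens))

    # All zeros except a trailing 1 marking the appended </s>.
    print(tokenizer.get_special_tokens_mask(ids[:-1]))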