U
    ,-e$                     @   s   d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
 ddlmZ ddlmZmZ e rfddlZeeZddiZdd	d
iiZd	diZdd ZG dd deZdS )zTokenization class for VITS.    N)AnyDictListOptionalTupleUnion   )PreTrainedTokenizer)is_phonemizer_availablelogging
vocab_filez
vocab.jsonzfacebook/mms-tts-engzChttps://huggingface.co/facebook/mms-tts-eng/resolve/main/vocab.jsoni   c                 C   s    t d}|| }|d k	}|S )Nz[^\x00-\x7F])recompilesearch)input_stringZnon_roman_patternmatchZhas_non_roman r   k/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/vits/tokenization_vits.pyhas_non_roman_characters/   s    

r   c                	       s   e Zd ZdZeZeZeZ	ddgZ
d$dd	 fd
dZedd Zdd Zdd Zdd Zd%eeee eeeeef f dddZeee dddZee edddZdd Zdd  Zd&eee eee df d!d"d#Z  ZS )'VitsTokenizera  
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
    Z	input_idsZattention_mask<pad><unk>NTF)returnc	              
      s~   t |dd}
t|
| _W 5 Q R X dd | j D | _|| _|| _|| _|| _	|| _
t jf |||||||d|	 d S )Nutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>c   s      z*VitsTokenizer.__init__.<locals>.<dictcomp>)	pad_token	unk_tokenlanguage	add_blank	normalize	phonemize	is_uroman)openjsonloadencoderitemsdecoderr"   r#   r$   r%   r&   super__init__)selfr   r    r!   r"   r#   r$   r%   r&   kwargsZvocab_handle	__class__r   r   r.   T   s&    zVitsTokenizer.__init__c                 C   s
   t | jS N)lenr*   r/   r   r   r   
vocab_sizev   s    zVitsTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokens)r   ir5   r   r   r   {   s      z+VitsTokenizer.get_vocab.<locals>.<dictcomp>)ranger6   updateadded_tokens_encoder)r/   Zvocabr   r5   r   	get_vocabz   s    zVitsTokenizer.get_vocabc                 C   s   t | j t | j  }d}d}|t|k rd}|D ]8}|||t|  |kr8||7 }|t|7 }d} qrq8|s$|||  7 }|d7 }q$|S )zfLowercase the input string, respecting any special token ids that may be part or entirely upper-cased. r   FT   )listr*   keysr:   r4   lower)r/   r   Zall_vocabularyfiltered_textr7   found_matchwordr   r   r   normalize_text   s    
zVitsTokenizer.normalize_textc                 C   s   | j dkr|dd}|S )z4Special treatment of characters in certain languagesZronu   țu   ţ)r"   replace)r/   textr   r   r   _preprocess_char   s    
zVitsTokenizer._preprocess_char)rF   is_split_into_wordsr$   r   c                    s   |dk	r|n j }|r  |} |}t|rB jrBtd  jr|t sVt	dt
j|dddddd}tdd	|}n$|rd
tt fdd| }||fS )a  
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        NzText to the tokenizer contains non-Roman characters. Ensure the `uroman` Romanizer is applied to the text prior to passing it to the tokenizer. See `https://github.com/isi-nlp/uroman` for details.zEPlease install the `phonemizer` Python package to use this tokenizer.zen-usZespeakT)r"   backendstripZpreserve_punctuationZwith_stressz\s+ r<   c                    s
   |  j kS r3   )r*   )charr5   r   r   <lambda>       z8VitsTokenizer.prepare_for_tokenization.<locals>.<lambda>)r$   rD   rG   r   r&   loggerwarningr%   r
   ImportError
phonemizerr   subjoinr>   filterrJ   )r/   rF   rH   r$   r0   rA   r   r5   r   prepare_for_tokenization   s.    

 z&VitsTokenizer.prepare_for_tokenization)rF   r   c                 C   s@   t |}| jr<| dgt|d d  }||ddd< |}|S )z]Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters.r      r=   N)r>   r#   _convert_id_to_tokenr4   )r/   rF   tokensZinterspersedr   r   r   	_tokenize   s    zVitsTokenizer._tokenize)rY   r   c                 C   s*   | j r t|dkr |dd d }d|S )Nr=   rW   r<   )r#   r4   rT   )r/   rY   r   r   r   convert_tokens_to_string   s    z&VitsTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r*   getr!   )r/   tokenr   r   r   _convert_token_to_id   s    z"VitsTokenizer._convert_token_to_idc                 C   s   | j |S )z=Converts an index (integer) in a token (str) using the vocab.)r,   r\   )r/   indexr   r   r   rX      s    z"VitsTokenizer._convert_id_to_token)save_directoryfilename_prefixr   c              	   C   s   t j|s"td| d d S t j||r6|d ndtd  }t|ddd$}|t	j
| jd	d
ddd  W 5 Q R X |fS )NzVocabulary path (z) should be a directory-r<   r   wr   r   rW   TF)indent	sort_keysensure_ascii
)ospathisdirrO   errorrT   VOCAB_FILES_NAMESr'   writer(   dumpsr*   )r/   r`   ra   r   fr   r   r   save_vocabulary   s     (zVitsTokenizer.save_vocabulary)r   r   NTTTF)FN)N) __name__
__module____qualname____doc__rl   Zvocab_files_namesPRETRAINED_VOCAB_FILES_MAPZpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesZmodel_input_namesr.   propertyr6   r;   rD   rG   strboolr   r   r   r   rV   r   rZ   r[   r^   rX   r   rp   __classcell__r   r   r1   r   r   9   sB          "
     <r   )rt   r(   rh   r   typingr   r   r   r   r   r   Ztokenization_utilsr	   utilsr
   r   rR   Z
get_loggerrq   rO   rl   ru   rv   r   r   r   r   r   r   <module>   s(    
  
