""" Tokenization class for model T5."""
import os
import re
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
        "t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model",
        "t5-large": "https://huggingface.co/t5-large/resolve/main/spiece.model",
        "t5-3b": "https://huggingface.co/t5-3b/resolve/main/spiece.model",
        "t5-11b": "https://huggingface.co/t5-11b/resolve/main/spiece.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "t5-small": 512,
    "t5-base": 512,
    "t5-large": 512,
    "t5-3b": 512,
    "t5-11b": 512,
}

SPIECE_UNDERLINE = "▁"


class T5Tokenizer(PreTrainedTokenizer):
    """
    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. They can be retrieved by calling the
            `get_sentinel_tokens` method, and their token ids by calling the `get_sentinel_token_ids` method (see the
            example at the end of this docstring).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
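
            For example, subword regularization can be turned on by forwarding the corresponding SentencePiece
            options (a minimal sketch; the values below are illustrative, not recommendations):

            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained(
            ...     "t5-base", sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
            ... )
            ```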
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy refers to the behaviour
            before the merge of #24622 and #25224, which include fixes to properly handle tokens that appear after
            special tokens. A simple
            example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
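
    Example of retrieving the sentinel tokens added through `extra_ids` (a minimal sketch; the concrete ids depend on
    the vocabulary, and the order of the returned list is not guaranteed):

    ```python
    >>> from transformers import T5Tokenizer

    >>> tokenizer = T5Tokenizer.from_pretrained("t5-base")
    >>> sentinel_tokens = tokenizer.get_sentinel_tokens()  # e.g. "<extra_id_0>" ... "<extra_id_99>"
    >>> sentinel_ids = tokenizer.get_sentinel_token_ids()  # the matching vocabulary ids
    ```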
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        legacy=None,
        **kwargs,
    ) -> None:
        pad_token = AddedToken(pad_token, rstrip=True, lstrip=True)
        unk_token = AddedToken(unk_token, rstrip=True, lstrip=True)
        eos_token = AddedToken(eos_token, rstrip=True, lstrip=True)

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is not None:
            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
            if extra_ids > 0 and extra_ids != len(extra_tokens):
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
                    " tokens"
                )
        else:
            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
            additional_special_tokens = extra_tokens

        # The sentinel tokens live at the very end of the vocabulary, after the regular SentencePiece pieces.
        self._added_tokens_decoder = {}
        for i in range(len(extra_tokens)):
            self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
                f"<extra_id_{i}>", single_word=False, lstrip=True, rstrip=True, special=True
            )

        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is expected, and simply"
                " means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to"
                " use the new behaviour, set `legacy=False`. This should only be set if you understand what it means,"
                " and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565"
            )
            legacy = True

        self.legacy = legacy
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            legacy=legacy,
            **kwargs,
        )

    def get_spm_processor(self, from_slow=False):
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if self.legacy or from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            # Disable the dummy prefix so that tokens following special tokens are not given an extra "▁".
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes:
            deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[pretrained_model_name_or_path]
            if init_max_model_length is not None and init_max_model_length != max_model_length:
                return init_max_model_length
            elif init_max_model_length is None:
                warnings.warn(
                    "This tokenizer was incorrectly instantiated with a model max length of"
                    f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
                    " behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
                    " `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
                    f" {pretrained_model_name_or_path} automatically truncating your input to"
                    f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
                    f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
                    " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
                    " instantiate this tokenizer with `model_max_length` set to your preferred value.",
                    FutureWarning,
                )

        return max_model_length

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
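
        For illustration, assuming `tokenizer` is an instantiated `T5Tokenizer` (the ids below are placeholders):

        ```python
        >>> tokenizer.get_special_tokens_mask([101, 102, 103], already_has_special_tokens=False)
        [0, 0, 0, 1]
        ```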
        T)rN   rO   rP   Nr   r$   )r7   get_special_tokens_maskr0   )r9   rN   rO   rP   r;   r   r   rQ     s      z#T5Tokenizer.get_special_tokens_maskc                 C   s   t ttdd | jS )Nc                 S   s   t td| d k	S )Nz<extra_id_\d+>)boolresearch)r   r   r   r   <lambda>$      z1T5Tokenizer.get_sentinel_tokens.<locals>.<lambda>)listsetfilterr*   rI   r   r   r   get_sentinel_tokens"  s    zT5Tokenizer.get_sentinel_tokensc                    s    fdd   D S )Nc                    s   g | ]}  |qS r   )Zconvert_tokens_to_ids)r   tokenrI   r   r   r   (  s     z6T5Tokenizer.get_sentinel_token_ids.<locals>.<listcomp>)rZ   rI   r   rI   r   get_sentinel_token_ids'  s    z"T5Tokenizer.get_sentinel_token_ids)	token_idsr   c                 C   sB   t |dkr2|d | jkr2td| j d |S || jg S dS )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.N)r0   eos_token_idrD   rE   r&   )r9   r]   r   r   r   _add_eos_if_not_present*  s    z#T5Tokenizer._add_eos_if_not_present)rN   rO   r   c                 C   s<   | j g}|dkr"t|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
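
        For illustration, the zeros also cover the `</s>` position that `build_inputs_with_special_tokens` appends to
        each sequence (ids below reuse the `encode` example from the class docstring):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([8774, 5])
        [0, 0, 0]
        ```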
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
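
        For illustration, reusing ids from the `encode` examples in the class docstring (T5's `eos_token_id` is `1`):

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([8774, 5])
        [8774, 5, 1]
        ```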
        N)r`   )r9   rN   rO   r   r   r    build_inputs_with_special_tokensK  s
    

z,T5Tokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr.   )__dict__copy)r9   stater   r   r   __getstate__e  s    
zT5Tokenizer.__getstate__c                 C   s8   || _ t| dsi | _tjf | j| _| j| j d S )Nr   )rc   hasattrr   r,   r-   r.   r/   r   )r9   dr   r   r   __setstate__j  s
    
zT5Tokenizer.__setstate__r   )textr   c                    st   | j st|dkr"t j|f|S t jt|td f|}t|dkrp|d tkrp|d | jkrp|dd }|S )z
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        tokens = self.sp_model.encode(text, out_type=str)
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens

        # 1. Encode string + prefix, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove self.unk_token from ['<', 'unk', '>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
zT5Tokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r.   Zpiece_to_id)r9   r[   r   r   r   _convert_token_to_id  s    z T5Tokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r.   Z	IdToPiece)r9   indexr[   r   r   r   _convert_id_to_token  s    z T5Tokenizer._convert_id_to_tokenc                 C   s   g }|d  t|d< d}d}|D ]F}|| jkrZ|s<|d7 }|| j|| 7 }d}g }q"|| d}q"|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string.r    Frk   T)r   rm   ro   r.   decodeappendstrip)r9   rp   Zcurrent_sub_tokensZ
out_stringZprev_is_specialr[   r   r   r   convert_tokens_to_string  s    

z$T5Tokenizer.convert_tokens_to_string)save_directoryfilename_prefixr   c              	   C   s   t j|s"td| d d S t j||r6|d ndtd  }t j| jt j|krzt j	| jrzt
| j| n8t j	| jst|d}| j }|| W 5 Q R X |fS )NzVocabulary path (z) should be a directory-rx   r   wb)ospathisdirr3   errorjoinVOCAB_FILES_NAMESabspathr   isfiler   r=   r.   Zserialized_model_protowrite)r9   r}   r~   Zout_vocab_filefiZcontent_spiece_modelr   r   r   save_vocabulary  s     (
zT5Tokenizer.save_vocabulary)r   r   r   r   NNN)F)NF)N)N)F)N)*r?   