"""
 Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
 tokenization_utils_fast.py
"""
import bisect
import itertools
import re
import unicodedata
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple, Union, overload

from .tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging


logger = logging.get_logger(__name__)

SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"


class Trie:
    """
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
    Loose reference https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self):
        self.data = {}
        self._tokens = set()

    def add(self, word: str):
        """
        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.
        The special key `""` is used to represent termination.

        This function is idempotent; adding the same word twice leaves the trie unchanged.

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        """
        if not word:
            # Prevent empty strings from being added to the trie
            return

        self._tokens.add(word)
        ref = self.data
        for char in word:
            ref[char] = char in ref and ref[char] or {}
            ref = ref[char]
        ref[""] = 1

    def split(self, text: str) -> List[str]:
        """
        Will look for the words added to the trie within `text`. The output is the original string split along the
        boundaries of the words found.

        This trie will match the longest possible word first!

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        """
        # `states` keeps track of every in-progress partial match: the key is the offset in `text` where the match
        # started, the value is a pointer into the trie for the characters matched so far.
        states = OrderedDict()

        # Offsets where `text` needs to be cut. We force a cut at 0; len(text) is appended later by `cut_text`.
        offsets = [0]

        # Used by the lookahead to skip over characters already consumed by a longer match.
        skip = 0

        for current, current_char in enumerate(text):
            if skip and current < skip:
                # This character was already consumed by a previous longest match.
                continue

            # Partial matches that stop matching on this character.
            to_remove = set()
            # Whether a full match was found (greedy: we stop at the first finished token).
            reset = False

            for start, trie_pointer in states.items():
                if "" in trie_pointer:
                    # A complete match. Look ahead to prefer the longest possible match
                    # (e.g. "extra_id_100" over "extra_id_1"), including earlier partial matches.
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            # This partial match starts later; the earlier one wins.
                            break
                        elif lookstart < start:
                            # Earlier partial match: its pointer was already advanced, so the lookahead
                            # starts one character further.
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            # Same match as the one that just completed; indices are the current ones.
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if "" in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index

                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if "" in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index

                            if lookahead_index == len(text):
                                # End of string
                                break
                            next_char = text[lookahead_index]

                    # Store the boundaries of the match and restart the scan.
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The partial match grows by one character.
                    trie_pointer = trie_pointer[current_char]
                    states[start] = trie_pointer
                else:
                    # The partial match dies here; stop tracking it.
                    to_remove.add(start)

            if reset:
                # A full match was found: drop every in-progress partial match.
                states = {}
            else:
                for start in to_remove:
                    del states[start]

            # Start a new partial match if this character is the first character of a token.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # Handle matches that end exactly at the end of the string.
        for start, trie_pointer in states.items():
            if "" in trie_pointer:
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                # The longest cut is the one with the lowest start, so stop at the first one.
                break

        return self.cut_text(text, offsets)

    def cut_text(self, text, offsets):
        # Sentinel so that the last chunk (after the final match) is emitted too.
        offsets.append(len(text))
        tokens = []
        start = 0
        for end in offsets:
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                    " anyway."
                )
                continue
            elif start == end:
                # Might happen if there is a match at index 0 or two consecutive matches.
                continue
            tokens.append(text[start:end])
            start = end

        return tokens

def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters, but we treat them as whitespace
    # since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters, but we count them as whitespace characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation. Characters such as "^", "$", and "`" are not in the
    # Unicode Punctuation class, but we treat them as punctuation anyway for consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_end_of_word(text):
    """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
    last_char = text[-1]
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


def _is_start_of_word(text):
    """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
    first_char = text[0]
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))


def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
    """
    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
    """
    insertion_idx = bisect.bisect_left(token_list, new_token)
    # Check if new_token is already in the ordered token_list
    if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
        # new_token is already in token_list, don't add it
        return
    else:
        token_list.insert(insertion_idx, new_token)


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizer(PreTrainedTokenizerBase):
    """
    Base class for all slow tokenizers.

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way, on top of all tokenizers, so we don't have to handle
    the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE,
    sentencepiece...).
    """

    def __init__(self, **kwargs):
        # 1. Init the parent class
        super().__init__(**kwargs)
        self.tokens_trie = Trie()

        # 2. Init `_added_tokens_decoder` if the child class did not
        if not hasattr(self, "_added_tokens_decoder"):
            self._added_tokens_decoder: Dict[int, AddedToken] = {}

        # 3. If an `added_tokens_decoder` is passed, we are loading from a saved tokenizer: overwrite
        if "added_tokens_decoder" in kwargs:
            self._added_tokens_decoder.update(kwargs.get("added_tokens_decoder"))
        self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}

        # 4. Add the special tokens that are not yet part of the vocab
        self._add_tokens(self.all_special_tokens_extended, special_tokens=True)

        self._decode_use_source_tokenizer = False

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        raise NotImplementedError

    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[int, AddedToken]`: The added tokens.
        """
        return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))

    @added_tokens_decoder.setter
    def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
        # Always raise an error if a plain string index/value pair is malformed, so users define the behavior
        for index, token in value.items():
            if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
                raise ValueError(
                    f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__},"
                    f" should be a dict of {int, Union[AddedToken, str]}"
                )
            self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
            self._added_tokens_encoder[str(token)] = index

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
        the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
        something we should change.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return self._added_tokens_encoder

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because otherwise,
        if there is a hole in the vocab, we would add tokens at a wrong index.
        """
        return len(set(self.get_vocab().keys()))

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the
        vocab which is why they have to be handled specifically.

        Args:
            new_tokens (`List[str]` or `List[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
                (tested by checking if the tokenizer assigns the index of the `unk_token` to them). If a token is part
                of the vocabulary then we simply mark this token as an `AddedToken`, which allows controlling the
                stripping and normalization of this token. This is NOT possible in `tokenizers`.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            `int`: The number of tokens actually added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        model = BertModel.from_pretrained("bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```"""
        added_tokens = 0
        if new_tokens is None:
            return added_tokens
        # NOTE: get_vocab() can be slow for large vocabularies; it is only called once per add call.
        current_vocab = self.get_vocab().copy()
        new_idx = len(current_vocab)  # only call this once, len gives the last index + 1
        for token in new_tokens:
            if not isinstance(token, (str, AddedToken)):
                raise TypeError(f"Token {token} is not a string but a {type(token)}.")
            if str(token) == "":
                continue
            if isinstance(token, str):
                # For legacy purposes we default to not stripping
                token = AddedToken(token, normalized=not special_tokens, rstrip=False, lstrip=False)
            if special_tokens:
                token.special = True
            if token in self._added_tokens_decoder:
                continue
            if not token.special and token.normalized and hasattr(self, "do_lower_case") and self.do_lower_case:
                # Normalize if requested
                token.content = token.content.lower()
            if token.content not in current_vocab:
                token_index = new_idx + added_tokens
                current_vocab[token.content] = token_index
                added_tokens += 1
            else:
                token_index = current_vocab[token.content]

            if token.special and str(token) not in self.all_special_tokens:
                self._additional_special_tokens.append(token)
            # the setter automatically updates the reverse map
            self._added_tokens_decoder[token_index] = token
            self._added_tokens_encoder[token.content] = token_index
            if self.verbose:
                logger.info(f"Adding {token} to the vocabulary")

        self._update_trie()
        return added_tokens

    def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
        for token in self._added_tokens_decoder.values():
            if token not in self.tokens_trie._tokens:
                self.tokens_trie.add(token.content)
        for token in unique_no_split_tokens:
            if token not in self.tokens_trie._tokens:
                self.tokens_trie.add(token)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Splits into words for word-based vocabularies or into sub-words for sub-word-based vocabularies
        (BPE/SentencePieces/WordPieces). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        """
        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if kwargs:
            logger.warning(f"Keyword arguments {kwargs} not recognized.")

        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # convert non-special tokens to lowercase
            escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
            escaped_special_toks += [
                re.escape(s_tok.content)
                for s_tok in (self._added_tokens_decoder.values())
                if not s_tok.special and s_tok.normalized
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        if split_special_tokens:
            no_split_token = []
            tokens = [text]
        else:
            no_split_token = set(self._added_tokens_encoder.keys())  # don't split on any of the added tokens
            # "This is something<special_token_1>  else"
            tokens = self.tokens_trie.split(text)

        # ["This is something", "<special_token_1>", "  else"]
        for i, token in enumerate(tokens):
            if token in no_split_token:
                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
                left = tokens[i - 1] if i > 0 else None
                right = tokens[i + 1] if i < len(tokens) - 1 else None
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.rstrip and right:
                        # A bit counter-intuitive, but we strip the left of the string since
                        # tok_extended.rstrip means the special token is eating all white spaces on its right.
                        tokens[i + 1] = right.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and left:
                        tokens[i - 1] = left.rstrip()  # Opposite here
                    if tok_extended.single_word and left and left[-1] != " ":
                        tokens[i - 1] += token
                        tokens[i] = ""
                    elif tok_extended.single_word and right and right[0] != " ":
                        tokens[i + 1] = token + tokens[i + 1]
                        tokens[i] = ""
                else:
                    raise ValueError(
                        f"{tok_extended} cannot be tokenized because it was not properly added"
                        f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
                    )

        # ["This is something", "<special_token_1>", "else"]
        tokenized_text = []
        for token in tokens:
            # Need to skip eventual empty (fully stripped) tokens
            if not token:
                continue
            if token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token))

        # ["This", " is", " something", "<special_token_1>", "else"]
        return tokenized_text

    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer. Splits into words for word-based
        vocabularies or into sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self._added_tokens_encoder:
            return self._added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    tokens = list(
                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                if is_split_into_words:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
                        " `is_split_into_words=True`."
                    )
                else:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
                        " integers."
                    )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    tokens = list(
                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = get_input_ids(ids)
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens, and
        manages a moving window (with user-defined stride) for overflowing tokens.

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        batch_outputs = {}
        for first_ids, second_ids in batch_ids_pairs:
            outputs = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        return (text, kwargs)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )

            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    @overload
    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
        ...

    @overload
    def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
        ...

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            if ids in self._added_tokens_decoder:
                return self._added_tokens_decoder[ids].content
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self._added_tokens_decoder:
                tokens.append(self._added_tokens_decoder[index].content)
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return " ".join(tokens)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        spaces_between_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
        legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
            token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
        }
        # Build the string separately for added tokens and regular tokens so that added tokens are never merged into
        # neighbouring sub-word pieces by `convert_tokens_to_string`.
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_ids:
                continue
            if token in legacy_added_tokens:
                if current_sub_text:
                    string = self.convert_tokens_to_string(current_sub_text)
                    if len(string) > 0:
                        sub_texts.append(string)
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))

        if spaces_between_special_tokens:
            text = " ".join(sub_texts)
        else:
            text = "".join(sub_texts)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text