""" Tokenization class for Perceiver."""

from typing import Dict, List, Optional, Tuple

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class PerceiverTokenizer(PreTrainedTokenizer):
    """
    Construct a Perceiver tokenizer. The Perceiver simply operates on raw UTF-8 bytes, so no vocabulary file is
    needed.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        bos_token (`str`, *optional*, defaults to `"[BOS]"`):
            The BOS token (reserved in the vocab, but not actually used).
        eos_token (`str`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token (reserved in the vocab, but not actually used).

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The MASK token, useful for masked language modeling.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The CLS token (reserved in the vocab, but not actually used).
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from two sequences.
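
    Example (an illustrative round trip; each UTF-8 byte maps to one id, offset by the 6 reserved special tokens):

    ```python
    >>> from transformers import PerceiverTokenizer

    >>> tokenizer = PerceiverTokenizer()
    >>> ids = tokenizer("déjà vu").input_ids  # [CLS] + one id per UTF-8 byte + [SEP]
    >>> tokenizer.decode(ids, skip_special_tokens=True)
    'déjà vu'
    ```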

    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        mask_token="[MASK]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        model_max_length=2048,
        **kwargs,
    ) -> None:
        # Wrap plain-string tokens in AddedToken so they are handled consistently downstream
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token

        self._utf_vocab_size = 2**8  # utf is 8 bits

        # These special tokens are not part of the byte vocabulary, so they are registered manually at ids 0-5
        self._added_tokens_decoder: Dict[int, AddedToken] = {
            0: pad_token,
            1: bos_token,
            2: eos_token,
            3: mask_token,
            4: cls_token,
            5: sep_token,
        }
        self._num_special_tokens = len(self._added_tokens_decoder)
        super().__init__(
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            mask_token=mask_token,
            cls_token=cls_token,
            sep_token=sep_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    def get_vocab(self) -> Dict[str, int]:
        # Byte tokens are shifted up by the number of special tokens so that ids 0-5 stay reserved
        vocab = {}
        for i in range(self._utf_vocab_size):
            token = chr(i)
            vocab[token] = i + self._num_special_tokens
        vocab.update(self.added_tokens_encoder)
        return vocab

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
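
        Example (a minimal illustration for a single two-token sequence):

        ```python
        >>> tokenizer = PerceiverTokenizer()
        >>> tokenizer.get_special_tokens_mask([104, 111])
        [1, 0, 0, 1]
        ```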
        T)r7   r8   r9   Nr   r   )r&   get_special_tokens_maskr$   )r(   r7   r8   r9   r*   r,   r-   r:   q   s      z*PerceiverTokenizer.get_special_tokens_mask)r7   r8   r   c                 C   s@   |dkr| j g| | jg S | j g| | jg | | jg S dS )af  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks. A sequence has the
        following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
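
        Example (illustrative; with the default special tokens, `cls_token_id` is 4 and `sep_token_id` is 5):

        ```python
        >>> tokenizer = PerceiverTokenizer()
        >>> tokenizer.build_inputs_with_special_tokens([104, 111], [104, 105])
        [4, 104, 111, 5, 104, 105, 5]
        ```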
        N)Zcls_token_idZsep_token_id)r(   r7   r8   r,   r,   r-    build_inputs_with_special_tokens   s    z3PerceiverTokenizer.build_inputs_with_special_tokens)textr   c                 C   s   dd | dD }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 S   s   g | ]}t |qS r,   )r0   ).0r3   r,   r,   r-   
<listcomp>   s     z0PerceiverTokenizer._tokenize.<locals>.<listcomp>utf-8)encode)r(   r<   tokensr,   r,   r-   	_tokenize   s    zPerceiverTokenizer._tokenizec                 C   s&   t |dkr| j}nt|| j }|S )z0Converts a token (str) in an id using the vocab.r   )r$   Zunk_token_idordr%   )r(   r4   Ztoken_idr,   r,   r-   _convert_token_to_id   s    z'PerceiverTokenizer._convert_token_to_idc                 C   s   t || j }|S )z=Converts an index (integer) in a token (str) using the vocab.)r0   r%   )r(   indexr4   r,   r,   r-   _convert_id_to_token   s    z'PerceiverTokenizer._convert_id_to_tokenc                 C   sP   d}|D ]4}|| j kr&t|d}ntt|g}||7 }q|jddd}|S )z:Converts a sequence of tokens (string) in a single string.    r?   replace)errors)r2   r"   r@   bytesrC   decode)r(   rA   bstringr4   Z
tok_stringstringr,   r,   r-   convert_tokens_to_string   s    

z+PerceiverTokenizer.convert_tokens_to_string)save_directoryfilename_prefixr   c                 C   s   dS )Nr,   r,   )r(   rO   rP   r,   r,   r-   save_vocabulary   s    z"PerceiverTokenizer.save_vocabulary)r   r   r   r   r   r   r   )NF)N)N)__name__
__module____qualname____doc__Zmodel_input_namesr'   r   r"   intr5   propertyr6   r   r   boolr:   r;   rB   rD   rF   rN   r   rQ   __classcell__r,   r,   r*   r-   r
      sD          
)
    
   
r
   N)rU   typingr   r   r   r   Ztokenization_utilsr   r   utilsr	   Z
get_loggerrR   loggerr
   r,   r,   r,   r-   <module>   s
   
