U
    ,-e$                     @   s   U d Z ddlmZmZmZ ddlmZmZ ddlm	Z	 e	
eZddiZdZdZd	Zd
ZdZdZdZedededededediZeeef ed< dd e D Zeeef ed< G dd deZdS )z Tokenization classes for CANINE.    )DictListOptional   )
AddedTokenPreTrainedTokenizer)loggingznielsr/canine-s   i   i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSc                 C   s   i | ]\}}||qS  r   ).0	codepointnamer   r   o/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/canine/tokenization_canine.py
<dictcomp>;   s      r   SPECIAL_CODEPOINTS_BY_NAMEc                       s,  e Zd ZdZeZeeeeeeeeee	ee
ddf fdd	ZeedddZd	d
 Zeee dddZeedddZeedddZdd Zd"ee eee  ee dddZd#ee eee  eee d fddZd$ee eee  ee dddZd%eee dd d!Z  ZS )&CanineTokenizera  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    Fr	   c	                    s  t |trt|dddn|}t |tr4t|dddn|}t |trPt|dddn|}t |trlt|dddn|}t |trt|dddn|}t |trt|dddn|}i | _t D ]\}
}|
| j|< qdd | j D | _t| _t	| j| _
t jf ||||||||d|	 d S )NF)lstriprstripTc                 S   s   i | ]\}}||qS r   r   )r   r   r   r   r   r   r   i   s     z,CanineTokenizer.__init__.<locals>.<dictcomp>)	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_length)
isinstancestrr   Z_special_codepointsr
   itemsZ_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelenZ_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargsr   r   	__class__r   r   r$   N   s4    	zCanineTokenizer.__init__)returnc                 C   s   | j S )N)r!   )r%   r   r   r   
vocab_size|   s    zCanineTokenizer.vocab_sizec                 C   s$   dd t | jD }|| j |S )Nc                 S   s   i | ]}t ||qS r   )chr)r   ir   r   r   r      s      z-CanineTokenizer.get_vocab.<locals>.<dictcomp>)ranger*   updateZadded_tokens_encoder)r%   Zvocabr   r   r   	get_vocab   s    zCanineTokenizer.get_vocab)textr)   c                 C   s   t |S )z5Tokenize a string (i.e. perform character splitting).)list)r%   r0   r   r   r   	_tokenize   s    zCanineTokenizer._tokenize)tokenr)   c                 C   s4   z
t |W S  tk
r.   td| dY nX dS )zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: ''N)ord	TypeError
ValueError)r%   r3   r   r   r   _convert_token_to_id   s    
z$CanineTokenizer._convert_token_to_id)indexr)   c                 C   sD   z|t krt | W S t|W S  tk
r>   td| Y nX dS )z
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: N)r
   r+   r6   r7   )r%   r9   r   r   r   _convert_id_to_token   s    

z$CanineTokenizer._convert_id_to_tokenc                 C   s
   d |S )N )join)r%   tokensr   r   r   convert_tokens_to_string   s    z(CanineTokenizer.convert_tokens_to_stringN)token_ids_0token_ids_1r)   c                 C   s4   | j g}| jg}|| | }|dk	r0||| 7 }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)sep_token_idcls_token_idr%   r?   r@   sepclsresultr   r   r    build_inputs_with_special_tokens   s    z0CanineTokenizer.build_inputs_with_special_tokens)r?   r@   already_has_special_tokensr)   c                    sT   |rt  j||ddS dgdgt|  dg }|dk	rP|dgt| dg 7 }|S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r?   r@   rH      r   N)r#   get_special_tokens_maskr"   )r%   r?   r@   rH   rF   r'   r   r   rJ      s      z'CanineTokenizer.get_special_tokens_maskc                 C   sH   | j g}| jg}t|| | dg }|dk	rD|t|| dg 7 }|S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   NrI   )rA   rB   r"   rC   r   r   r   $create_token_type_ids_from_sequences   s    z4CanineTokenizer.create_token_type_ids_from_sequences)save_directoryfilename_prefixc                 C   s   dS )Nr   r   )r%   rL   rM   r   r   r   save_vocabulary   s    zCanineTokenizer.save_vocabulary)N)NF)N)N)__name__
__module____qualname____doc__&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesr+   CLSSEPPADMASKr$   propertyintr*   r/   r   r   r2   r8   r:   r>   r   rG   boolrJ   rK   rN   __classcell__r   r   r'   r   r   >   sN   .  
    
   
 r   N)rR   typingr   r   r   Ztokenization_utilsr   r   utilsr   Z
get_loggerrO   loggerrS   r    rV   rT   rU   ZBOSrW   ZRESERVEDr
   rY   r   __annotations__r   r   r   r   r   r   r   <module>   s:   
 
      "