"""Tokenization classes for LLaMA."""
import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
    },
    "tokenizer_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
    },
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"hf-internal-testing/llama-tokenizer": 2048}

SPIECE_UNDERLINE = "▁"

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


class LlamaTokenizer(PreTrainedTokenizer):
    """
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
            and #25224, which includes fixes to properly handle tokens that appear after special tokens. A simple
            example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=True,
        spaces_between_special_tokens=False,
        legacy=None,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is expected, and simply"
                " means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to"
                " use the new behaviour, set `legacy=False`. This should only be set if you understand what it means,"
                " and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565"
            )
            legacy = True

        self.legacy = legacy
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            legacy=legacy,
            **kwargs,
        )

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def get_spm_processor(self, from_slow=False):
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if self.legacy or from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        tokens = self.sp_model.encode(text, out_type=str)
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens

        # 1. Encode string + prefix, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove self.unk_token from ['<', 'unk', '>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # since we manually add the prefix space, we have to remove it when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE):
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0 and self.legacy:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    @property
    def default_chat_template(self):
        """
        LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
        Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
        user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
        rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
        results in an unusual token ordering when it is present. This template should definitely be changed if you wish
        to fine-tune a model with more flexible role ordering!

        The output should look something like:

        <bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos> <bos>[INST] Prompt [/INST] Answer <eos>
        <bos>[INST] Prompt [/INST]
        """
        template = (
            "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}"
            "{% set system_message = messages[0]['content'] %}"
            "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
            "{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
            "{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}"
            "{% for message in loop_messages %}"
            "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
            "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
            "{% endif %}"
            "{% if loop.index0 == 0 and system_message != false %}"
            "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
            "{% else %}{% set content = message['content'] %}{% endif %}"
            "{% if message['role'] == 'user' %}"
            "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
            "{% elif message['role'] == 'system' %}"
            "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
            "{% elif message['role'] == 'assistant' %}"
            "{{ ' '  + content.strip() + ' ' + eos_token }}"
            "{% endif %}{% endfor %}"
        )
        template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
        default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
        template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)

        return template
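

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream transformers module).
# It assumes the `sentencepiece` package is installed and that the test
# checkpoint already referenced in PRETRAINED_VOCAB_FILES_MAP above is
# reachable; any repository shipping a compatible `tokenizer.model` would
# behave the same way.
if __name__ == "__main__":
    tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    ids = tokenizer.encode("Hello world")  # add_bos_token=True prepends the <s> token id
    print(ids)
    print(tokenizer.decode(ids, skip_special_tokens=True))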