"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""
import copy
import json
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .convert_slow_tokenizer import convert_slow_tokenizer
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# Fast tokenizers (provided by the HuggingFace tokenizers library) can be saved in a single file.
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Slow tokenizers keep their added tokens in a separate file.
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        slow_to_fast = kwargs.pop("slow_to_fast", False)

        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )

        if tokenizer_object is not None:
            fast_tokenizer = copy.deepcopy(tokenizer_object)
        elif fast_tokenizer_file is not None and not from_slow:
            # We have a serialization from the tokenizers library that we can load directly.
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif slow_tokenizer is not None:
            # We need to convert a slow tokenizer to build the backend.
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif self.slow_tokenizer_class is not None:
            # We need to instantiate and convert a slow tokenizer to build the backend.
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        else:
            raise ValueError(
                "Couldn't instantiate the backend tokenizer from one of: \n"
                "(1) a `tokenizers` library serialization file, \n"
                "(2) a slow tokenizer instance to convert or \n"
                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
            )

        self._tokenizer = fast_tokenizer

        if slow_tokenizer is not None:
            self.init_kwargs.update(slow_tokenizer.init_kwargs)

        self._decode_use_source_tokenizer = False

        _truncation = self._tokenizer.truncation
        if _truncation is not None:
            self._tokenizer.enable_truncation(**_truncation)
            kwargs.setdefault("max_length", _truncation["max_length"])
            kwargs.setdefault("truncation_side", _truncation["direction"])
            kwargs.setdefault("stride", _truncation["stride"])
            kwargs.setdefault("truncation_strategy", _truncation["strategy"])
        else:
            self._tokenizer.no_truncation()

        _padding = self._tokenizer.padding
        if _padding is not None:
            self._tokenizer.enable_padding(**_padding)
            kwargs.setdefault("pad_token", _padding["pad_token"])
            kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"])
            kwargs.setdefault("padding_side", _padding["direction"])
            kwargs.setdefault("max_length", _padding["length"])
            kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"])

        # We call this after having initialized the backend tokenizer because we update it afterwards.
        super().__init__(**kwargs)

        if not slow_to_fast:
            self._add_tokens(self.all_special_tokens_extended, special_tokens=True)

    @property
    def is_fast(self) -> bool:
        return True

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> Dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> Dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[int, AddedToken]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        # Set truncation and padding on the backend tokenizer.
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }
            current = None if _truncation is None else {k: _truncation.get(k, None) for k in target}
            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        ...

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        ...

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.backend_tokenizer.decoder.decode(tokens)

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        ...

    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `List[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.
        """
        ...
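

if __name__ == "__main__":
    # Illustrative sketch only, not part of the library module: it shows how a `tokenizers.Tokenizer`
    # built and trained directly with the 🤗 tokenizers API can be wrapped in `PreTrainedTokenizerFast`
    # through the `tokenizer_object` argument documented above. The WordLevel model, the special
    # tokens and the tiny in-memory corpus are made-up values for demonstration; run it with
    # `python -m transformers.tokenization_utils_fast` so the relative imports resolve.
    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordLevelTrainer

    raw_tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    raw_tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]"])
    raw_tokenizer.train_from_iterator(["hello world", "fast tokenizers wrap a rust backend"], trainer=trainer)

    # Wrapping the trained backend gives access to the full `PreTrainedTokenizerFast` API
    # (padding, truncation, saving with `save_pretrained`, ...).
    wrapped = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
    )
    print(wrapped(["hello world", "hello"], padding=True)["input_ids"])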