from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation by the following options, which you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespaces with the classic one.
            * handling Chinese characters by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespaces (deactivate it with
          `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        # Build the core BPE model, with or without a pre-existing vocab/merges pair
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(vocab, merges, dropout=dropout, unk_token=str(unk_token), end_of_word_suffix=suffix)
            )
        else:
            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []
        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]
        if lowercase:
            normalizers += [Lowercase()]

        # Combine the normalizers into a Sequence when more than one is active
        if len(normalizers) > 0:
            tokenizer.normalizer = Sequence(normalizers) if len(normalizers) > 1 else normalizers[0]

        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE", "unk_token": unk_token, "suffix": suffix, "dropout": dropout,
            "lowercase": lowercase, "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer, "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files"""
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens,
            limit_alphabet=limit_alphabet, initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix, show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens,
            limit_alphabet=limit_alphabet, initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix, show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)
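

# A minimal usage sketch, assuming a hypothetical plain-text corpus at "corpus.txt" and a
# hypothetical output path "char-bpe.tokenizer.json": train a character-level BPE vocabulary,
# then encode a sentence. `encode` and `save` come from BaseTokenizer. Because this module
# uses relative imports, run it as `python -m tokenizers.implementations.char_level_bpe`.
if __name__ == "__main__":
    tokenizer = CharBPETokenizer()  # defaults: "<unk>" token, "</w>" suffix, BERT-style normalizer
    tokenizer.train(["corpus.txt"], vocab_size=5000, min_frequency=2)  # hypothetical corpus file
    encoding = tokenizer.encode("Tokenization with a character-level BPE model.")
    print(encoding.tokens)  # word-final subwords carry the "</w>" suffix
    tokenizer.save("char-bpe.tokenizer.json")  # hypothetical output path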