U
    9%e_                     @   st   d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZ ddlmZ G dd deZd	S )
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizerstrainers)BPE)NFKC   )BaseTokenizerc                
       s,  e Zd ZdZdeeeeeef f  eeeee	eef e	eef f f  eee
f eeee ee d fdd	Zeeed
ddZdddgdg dfeeee f eeeeee
f  eee edddZdddgdg ddfeee eee  f eeeeee
f  eee eee dddZ  ZS )SentencePieceBPETokenizerzrSentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    N<unk>   ▁TF)vocabmerges	unk_tokenreplacementadd_prefix_spacedropoutfuse_unkc           
         s   |d k	r(|d k	r(t t|||||d}nt t|||d}|t|d k	r\|t|g t |_tj||d|_	t
j||d|_d||||d}	t ||	 d S )N)r   r   r   )r   r   ZSentencePieceBPE)modelr   r   r   r   )r	   r   Ztoken_to_idstrZadd_special_tokensr   Z
normalizerr   Z	MetaspaceZpre_tokenizerr
   decodersuper__init__)
selfr   r   r   r   r   r   r   Z	tokenizer
parameters	__class__ k/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/tokenizers/implementations/sentencepiece_bpe.pyr      s    
z"SentencePieceBPETokenizer.__init__)vocab_filenamemerges_filenamec                 K   s   t | |\}}t||f|S )N)r   	read_filer   )r&   r'   kwargsr   r   r$   r$   r%   	from_file0   s    z#SentencePieceBPETokenizer.from_filei0u     i  )files
vocab_sizemin_frequencyspecial_tokenslimit_alphabetinitial_alphabetshow_progressc           	      C   s:   t j||||||d}t|tr&|g}| jj||d dS )z%Train the model using the given filesr-   r.   r/   r0   r1   r2   )trainerN)r   
BpeTrainer
isinstancer   
_tokenizertrain)	r    r,   r-   r.   r/   r0   r1   r2   r4   r$   r$   r%   r8   5   s    
zSentencePieceBPETokenizer.train)iteratorr-   r.   r/   r0   r1   r2   lengthc	           
      C   s,   t j||||||d}	| jj||	|d dS )z(Train the model using the given iteratorr3   )r4   r:   N)r   r5   r7   train_from_iterator)
r    r9   r-   r.   r/   r0   r1   r2   r:   r4   r$   r$   r%   r;   M   s    z-SentencePieceBPETokenizer.train_from_iterator)NNr   r   TNF)__name__
__module____qualname____doc__r   r   r   r   intr   r   boolfloatr   staticmethodr*   r   r8   r   r;   __classcell__r$   r$   r"   r%   r   
   sf          &
 r   N)typingr   r   r   r   r   r   Z
tokenizersr   r	   r
   r   r   Ztokenizers.modelsr   Ztokenizers.normalizersr   Zbase_tokenizerr   r   r$   r$   r$   r%   <module>   s
    