U
    9%e                     @   s   d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZmZ ddlmZ G dd deZd	S )
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizers
processorstrainers)BPE)	LowercaseSequenceunicode_normalizer_from_str   )BaseTokenizerc                       s  e Zd ZdZdeeeeeef f  eeeee	eef e	eef f f  e
e
ee ee ee ee e
d	 fddZeeeddd	Zd
ddg feeee f eee
eeeef  dddZd
ddg dfeee eee  f eee
eeeef  ee dddZ  ZS )ByteLevelBPETokenizerzjByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    NF)	vocabmergesadd_prefix_space	lowercasedropoutunicode_normalizercontinuing_subword_prefixend_of_word_suffixtrim_offsetsc
              	      s   |d k	r0|d k	r0t t||||p d|p&dd}
n
t t }
g }|rP|t|g7 }|r`|t g7 }t|dkrt|dkrt||
_n
|d |
_tj|d|
_	t
 |
_tj|	d|
_d|||||||	d}t |
| d S )	N )r   r   r   r   r   )r   )r   ZByteLevelBPE)modelr   r   r   r   r   r   r   )r	   r   r   r   lenr   Z
normalizerr   	ByteLevelZpre_tokenizerr
   decoderr   Zpost_processorsuper__init__)selfr   r   r   r   r   r   r   r   r   Z	tokenizerZnormalizers
parameters	__class__ h/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/tokenizers/implementations/byte_level_bpe.pyr$      sB    



zByteLevelBPETokenizer.__init__)vocab_filenamemerges_filenamec                 K   s   t | |\}}t||f|S )N)r   	read_filer   )r+   r,   kwargsr   r   r)   r)   r*   	from_fileJ   s    zByteLevelBPETokenizer.from_filei0u     T)files
vocab_sizemin_frequencyshow_progressspecial_tokensc                 C   s>   t j||||tj d}t|tr*|g}| jj||d dS )z%Train the model using the given filesr2   r3   r4   r5   Zinitial_alphabet)trainerN)	r   
BpeTrainerr   r!   alphabet
isinstancestr
_tokenizertrain)r%   r1   r2   r3   r4   r5   r7   r)   r)   r*   r=   O   s    

zByteLevelBPETokenizer.train)iteratorr2   r3   r4   r5   lengthc                 C   s0   t j||||tj d}| jj|||d dS )z(Train the model using the given iteratorr6   )r7   r?   N)r   r8   r   r!   r9   r<   train_from_iterator)r%   r>   r2   r3   r4   r5   r?   r7   r)   r)   r*   r@   d   s    z)ByteLevelBPETokenizer.train_from_iterator)	NNFFNNNNF)__name__
__module____qualname____doc__r   r   r;   r   intr   boolfloatr$   staticmethodr/   r   r   r=   r   r@   __classcell__r)   r)   r'   r*   r   
   s^            &:r   N)typingr   r   r   r   r   r   Z
tokenizersr   r	   r
   r   r   r   Ztokenizers.modelsr   Ztokenizers.normalizersr   r   r   Zbase_tokenizerr   r   r)   r)   r)   r*   <module>   s
     