from typing import Dict, Iterator, List, Optional, Union

from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

from .base_tokenizer import BaseTokenizer


class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer"""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        for token in (unk_token, sep_token, cls_token, pad_token, mask_token):
            if tokenizer.token_to_id(str(token)) is not None:
                tokenizer.add_special_tokens([str(token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
            )
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)