U
    ,-eM                  5   @   sF  d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZmZ dd	lmZ dZddZG dd dZeedddZG dd dZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G d d! d!eZ%G d"d# d#eZ&G d$d% d%eZ'G d&d' d'eZ(G d(d) d)eZ)G d*d+ d+e)Z*G d,d- d-e)Z+G d.d/ d/e)Z,G d0d1 d1e)Z-G d2d3 d3e)Z.G d4d5 d5e)Z/G d6d7 d7e)Z0G d8d9 d9e)Z1G d:d; d;e)Z2G d<d= d=e)Z3G d>d? d?e)Z4G d@dA dAe)Z5G dBdC dCe)Z6G dDdE dEe)Z7G dFdG dGeZ8G dHdI dIe)Z9G dJdK dKeZ:G dLdM dMeZ;G dNdO dOeZ<G dPdQ dQe)Z=G dRdS dSe)Z>G dTdU dUeZ?e*e&e+ee9e<e,e:e$ee(e-eeeeee*e!e$e%eee&e1e&e&ee?e.e/e"ee&e0e#e6ee3e4ee&e'ee7e8e1e2e e=e>e>dV4Z@edWdXdYZAdS )[z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)DictListTuple)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availablerequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sV   t  rDdd l}t|jjtdk r4ddlm} nddlm} |S t	t
| d S )Nr   z4.0.0)sentencepiece_model_pb2)sentencepiece_model_pb2_new)r   Zgoogle.protobufr   parseprotobuf__version__Ztransformers.utilsr   r   ImportErrorr   format)error_messageZgoogler    r   d/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf!   s    r   c                   @   sB   e Zd ZdZedddZd	eeeef e	e f dddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    )modelc                 C   s.   t | d ddlm} | | _| j| d S )Nsentencepiecer   )SentencePieceProcessor)r   r"   r#   spLoad)selfr!   r#   r   r   r   __init__3   s    
zSentencePieceExtractor.__init__Nreturnc           
         s   | j   fddt  D |dk	r8t|d }}n
d }}g }| D ]x\}}g }tdt|D ]>}|d| ||d  }}	|krh|	krh|||	|f qht|fddd	}|| qNt|d
d |d}dd |D }|fS )z
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                    s   i | ]}  ||qS r   )Zid_to_piece).0index)r$   r   r   
<dictcomp>@   s      z2SentencePieceExtractor.extract.<locals>.<dictcomp>NTFr   c                    s    | d   | d  fS )Nr   r   r   )x)vocabr   r   <lambda>N       z0SentencePieceExtractor.extract.<locals>.<lambda>)keyc                 S   s   | d S )N   r   )valr   r   r   r/   Q   r0   )r1   reversec                 S   s   g | ]}|d  |d fqS )r   r   r   )r*   r3   r   r   r   
<listcomp>R   s     z2SentencePieceExtractor.extract.<locals>.<listcomp>)	r$   rangeZGetPieceSizedictitemslenappendsortedextend)
r&   vocab_scoresr4   mergesmergeZpiece_scorelocalr+   Zpiece_lZpiece_rr   )r$   r.   r   extract:   s"    
zSentencePieceExtractor.extract)N)__name__
__module____qualname____doc__strr'   r   r   intr   rA   r   r   r   r   r    .   s   r    )piecer)   c                 C   s&   t | dk p$| d dkp$| d   S )Nr2   ,)r9   isdigit)rH   r   r   r   check_number_commaV   s    rM   c                   @   s"   e Zd Zdd ZedddZdS )	Converterc                 C   s
   || _ d S N)original_tokenizer)r&   rP   r   r   r   r'   [   s    zConverter.__init__r(   c                 C   s
   t  d S rO   )NotImplementedErrorr&   r   r   r   	converted^   s    zConverter.convertedN)rB   rC   rD   r'   r   rS   r   r   r   r   rN   Z   s   rN   c                   @   s   e Zd ZedddZdS )BertConverterr(   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerTZ
clean_textZhandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixrP   r.   r   r   rF   rV   hasattrrW   tokenize_chinese_charsrY   do_lower_caser
   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr	   decoder
r&   r.   	tokenizerrh   rY   ri   clsseprp   rq   r   r   r   rS   c   s:    



zBertConverter.convertedNrB   rC   rD   r   rS   r   r   r   r   rT   b   s   rT   c                   @   s   e Zd ZedddZdS )SplinterConverterr(   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkr| d| d	|	 d	| d
| d
}n"| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )NrU   FrW   TrX   .rightr[    r]   r^   r\   r_   rc   rd   )rP   r.   r   r   rF   rV   rg   rW   rh   rY   ri   r
   rj   rk   r   rl   rm   rn   ro   Zquestion_tokenrp   rq   question_token_idconvert_tokens_to_idsZpadding_sider   rr   rs   r	   rt   )r&   r.   rv   rh   rY   ri   rw   rx   questiondotrp   rq   r~   Zdot_token_idra   r   r   r   rS      sL    



$"
zSplinterConverter.convertedNry   r   r   r   r   rz      s   rz   c                   @   s   e Zd ZedddZdS )FunnelConverterr(   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )NrU   FrW   TrX   z:2 $A:0 r\   r]   r^   r_   rc   rd   rf   ru   r   r   r   rS      s:    



zFunnelConverter.convertedNry   r   r   r   r   r      s   r   c                   @   s   e Zd ZedddZdS )MPNetConverterr(   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )NrU   FrW   TrX   r[   r\   :0 r]   r^   r_   rc   rd   rf   ru   r   r   r   rS      s:    



zMPNetConverter.convertedNry   r   r   r   r   r      s   r   c                   @   s   e Zd ZedddZdS )OpenAIGPTConverterr(   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d k	r^|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r.   r>   dropoutrV   end_of_word_suffixfuse_unkT)rZ   suffix)rP   encoderlist	bpe_rankskeysrV   r   r   rF   Ztoken_to_idadd_special_tokensr
   rj   rk   r   rl   rm   r	   
BPEDecoderrt   r&   r.   r>   rV   rv   r   r   r   rS   
  s&    
zOpenAIGPTConverter.convertedNry   r   r   r   r   r   	  s   r   c                   @   s   e Zd ZedddZdS )GPT2Converterr(   c              	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j jr| j j}| j j}tj| d| d||fgd|_ntjdd|_|S )	Nr   Fr.   r>   r   continuing_subword_prefixr   r   add_prefix_spacez:0 $A:0z:0 $A:0 $B:1r_   )trim_offsets)rP   r   r   r   r   r   r   r   	ByteLevelr   rm   r	   rt   add_bos_token	bos_tokenbos_token_idr   rr   rs   )r&   r.   r>   rv   bosr   r   r   r   rS   %  s2    


zGPT2Converter.convertedNry   r   r   r   r   r   $  s   r   c                   @   s   e Zd ZedddZdS )HerbertConverterr(   c                 C   s   d}d}| j j}t| j j }||d d kr<|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   rV   r   F)rZ   rY   r   )rx   rw   )rP   r   r   r   r   r   r   rV   r
   rj   rk   r   rl   rm   r	   r   rt   r   ZBertProcessingro   rq   rn   rp   rs   )r&   Ztokenizer_info_strZtoken_suffixr.   r>   rv   r   r   r   rS   H  s.    

zHerbertConverter.convertedNry   r   r   r   r   r   G  s   r   c                   @   s   e Zd ZedddZdS )RobertaConverterr(   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Trx   rw   r   r   )rP   r   r   r   r   r   r   r   r   r   rm   r	   rt   r   RobertaProcessingro   rq   rn   rp   rs   r&   otr.   r>   rv   r   r   r   rS   g  s,    


zRobertaConverter.convertedNry   r   r   r   r   r   f  s   r   c                   @   s   e Zd ZedddZdS )RoFormerConverterr(   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdrT| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerrU   FrW   TrX   r[   r\   r]   r^   r_   rc   rd   )Z"models.roformer.tokenization_utilsr   rP   r.   r   r   rF   rV   rg   rW   rY   ri   r
   rj   rk   r   ZPreTokenizerZcustomrm   rn   ro   rp   rq   r   rr   rs   r	   rt   )
r&   r   r.   rv   rY   ri   rw   rx   rp   rq   r   r   r   rS     s8    

zRoFormerConverter.convertedNry   r   r   r   r   r     s   r   c                   @   s   e Zd ZedddZdS )DebertaConverterr(   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r_   )rP   r   r   r   r   r   r   r   r   r   rm   r	   rt   r   rr   r   rs   r   r   r   r   rS     s.    
	zDebertaConverter.convertedNry   r   r   r   r   r     s   r   c                       sb   e Zd Z fddZdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
edddZ  ZS )SpmConverterc              	      sv   t | d t j|  t }| }t| jjd}||	  W 5 Q R X || _
| j
jjrrt| dd srtd d S )Nr   rbhandle_byte_fallbacka  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr'   r   Z
ModelProtoopenrP   
vocab_fileZParseFromStringreadprototrainer_specbyte_fallbackgetattrwarningswarn)r&   argsZ	model_pb2mf	__class__r   r   r'     s    

zSpmConverter.__init__c                 C   s   dd |j D S )Nc                 S   s   g | ]}|j |jfqS r   rH   scorer*   rH   r   r   r   r5     s     z&SpmConverter.vocab.<locals>.<listcomp>piecesr&   r   r   r   r   r.     s    zSpmConverter.vocabc                 C   s   |j jS rO   )r   unk_idr   r   r   r   r     s    zSpmConverter.unk_idc           	      C   s   |j j}| |}| |}|dkr4tt||}nP|dkr|t| jj	 \}}dd t
|D }tt|||j jdd}ntd|S )Nr   r2   c                 S   s   i | ]\}\}}||qS r   r   )r*   iwordr   r   r   r   r,     s    
  z*SpmConverter.tokenizer.<locals>.<dictcomp>T)rV   r   ]You're trying to run a `Unigram` model but you're file was trained with a different algorithm)r   
model_typer.   r   r   r   r    rP   r   rA   	enumerater   	unk_piece	Exception)	r&   r   r   r=   r   rv   _r>   	bpe_vocabr   r   r   rv     s(    

	zSpmConverter.tokenizerc                 C   sH   |j j}|s$tttddgS tt|ttddgS d S N {2,}r}   )normalizer_specprecompiled_charsmapr
   SequenceReplacer   Precompiled)r&   r   r   r   r   r   rk     s    zSpmConverter.normalizerc                 C   s   t j||dS Nreplacementr   )r   	Metaspacer&   r   r   r   r   r   rm     s    zSpmConverter.pre_tokenizerc                 C   s   d S rO   r   rR   r   r   r   rs     s    zSpmConverter.post_processorc                 C   s   t j||dS r   )r	   r   r   r   r   r   rt     s    zSpmConverter.decoderr(   c                 C   sl   |  | j}| | j}|d k	r&||_d}d}| ||}|d k	rH||_| |||_|  }|rh||_|S )N   ▁T)rv   r   rk   rm   rt   rs   )r&   rv   rk   r   r   rm   rs   r   r   r   rS     s    zSpmConverter.converted)rB   rC   rD   r'   r.   r   rv   rk   rm   rs   rt   r   rS   __classcell__r   r   r   r   r     s   	r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   s   dd |j D S )Nc                 S   s2   g | ]*}t |jr|j|jfn|j|jd  fqS d   rM   rH   r   r   r   r   r   r5   )  s   z)AlbertConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r.   (  s    zAlbertConverter.vocabc                 C   s   t ddt ddg}| jjs<|t   |t   | jjrR|t   |j	j
}|rn|t | |t tdd t |S Nz``"z''r   r}   r
   r   rP   keep_accentsr:   NFKDStripAccentsri   	Lowercaser   r   r   r   r   r&   r   Zlist_normalizersr   r   r   r   rk   .  s    

zAlbertConverter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS Nr   r   r   r   r_   r   rr   rP   r   rR   r   r   r   rs   A  s    zAlbertConverter.post_processorNrB   rC   rD   r.   rk   rs   r   r   r   r   r   '  s   r   c                   @   s   e Zd Zdd Zdd ZdS )BarthezConverterc                 C   s   d}|S N   r   r&   r   r   r   r   r   r   M  s    zBarthezConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r_   r   rR   r   r   r   rs   Q  s    zBarthezConverter.post_processorN)rB   rC   rD   r   rs   r   r   r   r   r   L  s   r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )CamembertConverterc                 C   s8   dddddg}|dd |j dd  D 7 }|d	g7 }|S )
N)z
<s>NOTUSED        z<pad>r   )z</s>NOTUSEDr   <unk>r   )z<unk>NOTUSEDic                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5   f  s     z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>r   r   r&   r   r.   r   r   r   r.   ]  s    
zCamembertConverter.vocabc                 C   s   dS r   r   r   r   r   r   r   j  s    zCamembertConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS r   r   rR   r   r   r   rs   n  s    z!CamembertConverter.post_processorNrB   rC   rD   r.   r   rs   r   r   r   r   r   \  s   r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )DebertaV2Converterc                 C   s<   g }| j jr|tjdd |tj||d t|S )Nisolated)behaviorr   )rP   Zsplit_by_punctr:   r   Punctuationr   r   )r&   r   r   Zlist_pretokenizersr   r   r   rm   z  s
    z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|rD|t| |t	t
dd t|S r   )rP   ri   r:   r
   r   Stripr   r   r   r   r   r   r   r   r   r   rk     s    zDebertaV2Converter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS r   r   rR   r   r   r   rs     s    z!DebertaV2Converter.post_processorN)rB   rC   rD   rm   rk   rs   r   r   r   r   r   y  s   r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )MBartConverterc                 C   sp   ddddg}|dd |j dd  D 7 }|dd	d
dddddddddddddddddddddd g7 }|d!g7 }|S )"Nr   r   r   r   r   r   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5     s     z(MBartConverter.vocab.<locals>.<listcomp>r   Zar_ARr   cs_CZr   de_DEr   en_XXr   Zes_XXr   et_EEr   fi_FIr   Zfr_XXr   gu_INr   hi_INr   it_ITr   Zja_XXr   kk_KZr   ko_KRr   lt_LTr   lv_LVr   Zmy_MMr   ne_NPr   Znl_XXr   ro_ROr   ru_RUr   si_LKr   tr_TRr   vi_VNr   zh_CNr   r   r   r   r   r   r   r.     sF    
zMBartConverter.vocabc                 C   s   dS r   r   r   r   r   r   r     s    zMBartConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nz$A </s> en_XXz$A $B </s> en_XXr  r   r_   r   rR   r   r   r   rs     s    zMBartConverter.post_processorNr   r   r   r   r   r     s   &r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )MBart50Converterc              5   C   s   ddddg}|dd |j dd  D 7 }|dd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;g47 }|d<g7 }|S )=Nr   r   r   r   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5     s     z*MBart50Converter.vocab.<locals>.<listcomp>r   r  r  r  r  r  r	  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r#  r%  r'  r)  r+  )af_ZAr   )az_AZr   )bn_INr   )fa_IRr   )he_ILr   )hr_HRr   )id_IDr   )ka_GEr   )Zkm_KHr   )mk_MKr   )ml_INr   )mn_MNr   )mr_INr   )pl_PLr   )ps_AFr   )Zpt_XXr   )sv_SEr   )sw_KEr   )ta_INr   )te_INr   )th_THr   )Ztl_XXr   )uk_UAr   )ur_PKr   )xh_ZAr   )gl_ESr   )sl_SIr   r   r   r   r   r   r   r.     s    p
zMBart50Converter.vocabc                 C   s   dS r   r   r   r   r   r   r     s    zMBart50Converter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nzen_XX $A </s>zen_XX $A $B </s>r  r   r_   r   rR   r   r   r   rs     s    zMBart50Converter.post_processorNr   r   r   r   r   r-    s   r-  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )NllbConverterc                 C   s  ddddg}|dd |j dd  D 7 }|dd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddg7 }|dg7 }|S )Nr   r   r   r   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5     s     z'NllbConverter.vocab.<locals>.<listcomp>r   )Zace_Arabr   )Zace_Latnr   )Zacm_Arabr   )Zacq_Arabr   )Zaeb_Arabr   )Zafr_Latnr   )Zajp_Arabr   )Zaka_Latnr   )Zamh_Ethir   )Zapc_Arabr   )Zarb_Arabr   )Zars_Arabr   )Zary_Arabr   )Zarz_Arabr   )Zasm_Bengr   )Zast_Latnr   )Zawa_Devar   )Zayr_Latnr   )Zazb_Arabr   )Zazj_Latnr   )Zbak_Cyrlr   )Zbam_Latnr   )Zban_Latnr   )Zbel_Cyrlr   )Zbem_Latnr   )Zben_Bengr   )Zbho_Devar   )Zbjn_Arabr   )Zbjn_Latnr   )Zbod_Tibtr   )Zbos_Latnr   )Zbug_Latnr   )Zbul_Cyrlr   )Zcat_Latnr   )Zceb_Latnr   )Zces_Latnr   )Zcjk_Latnr   )Zckb_Arabr   )Zcrh_Latnr   )Zcym_Latnr   )Zdan_Latnr   )Zdeu_Latnr   )Zdik_Latnr   )Zdyu_Latnr   )Zdzo_Tibtr   )Zell_Grekr   )eng_Latnr   )Zepo_Latnr   )Zest_Latnr   )Zeus_Latnr   )Zewe_Latnr   )Zfao_Latnr   )Zpes_Arabr   )Zfij_Latnr   )Zfin_Latnr   )Zfon_Latnr   )Zfra_Latnr   )Zfur_Latnr   )Zfuv_Latnr   )Zgla_Latnr   )Zgle_Latnr   )Zglg_Latnr   )Zgrn_Latnr   )Zguj_Gujrr   )Zhat_Latnr   )Zhau_Latnr   )Zheb_Hebrr   )Zhin_Devar   )Zhne_Devar   )Zhrv_Latnr   )Zhun_Latnr   )Zhye_Armnr   )Zibo_Latnr   )Zilo_Latnr   )Zind_Latnr   )Zisl_Latnr   )Zita_Latnr   )Zjav_Latnr   )Zjpn_Jpanr   )Zkab_Latnr   )Zkac_Latnr   )Zkam_Latnr   )Zkan_Kndar   )Zkas_Arabr   )Zkas_Devar   )Zkat_Georr   )Zknc_Arabr   )Zknc_Latnr   )Zkaz_Cyrlr   )Zkbp_Latnr   )Zkea_Latnr   )Zkhm_Khmrr   )Zkik_Latnr   )Zkin_Latnr   )Zkir_Cyrlr   )Zkmb_Latnr   )Zkon_Latnr   )Zkor_Hangr   )Zkmr_Latnr   )Zlao_Laoor   )Zlvs_Latnr   )Zlij_Latnr   )Zlim_Latnr   )Zlin_Latnr   )Zlit_Latnr   )Zlmo_Latnr   )Zltg_Latnr   )Zltz_Latnr   )Zlua_Latnr   )Zlug_Latnr   )Zluo_Latnr   )Zlus_Latnr   )Zmag_Devar   )Zmai_Devar   )Zmal_Mlymr   )Zmar_Devar   )Zmin_Latnr   )Zmkd_Cyrlr   )Zplt_Latnr   )Zmlt_Latnr   )Zmni_Bengr   )Zkhk_Cyrlr   )Zmos_Latnr   )Zmri_Latnr   )Zzsm_Latnr   )Zmya_Mymrr   )Znld_Latnr   )Znno_Latnr   )Znob_Latnr   )Znpi_Devar   )Znso_Latnr   )Znus_Latnr   )Znya_Latnr   )Zoci_Latnr   )Zgaz_Latnr   )Zory_Oryar   )Zpag_Latnr   )Zpan_Gurur   )Zpap_Latnr   )Zpol_Latnr   )Zpor_Latnr   )Zprs_Arabr   )Zpbt_Arabr   )Zquy_Latnr   )Zron_Latnr   )Zrun_Latnr   )Zrus_Cyrlr   )Zsag_Latnr   )Zsan_Devar   )Zsat_Bengr   )Zscn_Latnr   )Zshn_Mymrr   )Zsin_Sinhr   )Zslk_Latnr   )Zslv_Latnr   )Zsmo_Latnr   )Zsna_Latnr   )Zsnd_Arabr   )Zsom_Latnr   )Zsot_Latnr   )Zspa_Latnr   )Zals_Latnr   )Zsrd_Latnr   )Zsrp_Cyrlr   )Zssw_Latnr   )Zsun_Latnr   )Zswe_Latnr   )Zswh_Latnr   )Zszl_Latnr   )Ztam_Tamlr   )Ztat_Cyrlr   )Ztel_Telur   )Ztgk_Cyrlr   )Ztgl_Latnr   )Ztha_Thair   )Ztir_Ethir   )Ztaq_Latnr   )Ztaq_Tfngr   )Ztpi_Latnr   )Ztsn_Latnr   )Ztso_Latnr   )Ztuk_Latnr   )Ztum_Latnr   )Ztur_Latnr   )Ztwi_Latnr   )Ztzm_Tfngr   )Zuig_Arabr   )Zukr_Cyrlr   )Zumb_Latnr   )Zurd_Arabr   )Zuzn_Latnr   )Zvec_Latnr   )Zvie_Latnr   )Zwar_Latnr   )Zwol_Latnr   )Zxho_Latnr   )Zydd_Hebrr   )Zyor_Latnr   )Zyue_Hantr   )Zzho_Hansr   )Zzho_Hantr   )Zzul_Latnr   r   r   r   r   r   r   r.     s                                                                                                                                                                                                            
zNllbConverter.vocabc                 C   s   dS r   r   r   r   r   r   r     s    zNllbConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nzeng_Latn $A </s>zeng_Latn $A $B </s>rG  r   r_   r   rR   r   r   r   rs     s    zNllbConverter.post_processorNr   r   r   r   r   rF    s   rF  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )XLMRobertaConverterc                 C   s6   ddddg}|dd |j dd  D 7 }|dg7 }|S )	Nr   r   r   r   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5     s     z-XLMRobertaConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r   r   r.     s    
zXLMRobertaConverter.vocabc                 C   s   d}|S r   r   r   r   r   r   r     s    zXLMRobertaConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS r   r   rR   r   r   r   rs     s    z"XLMRobertaConverter.post_processorNr   r   r   r   r   rH  
  s   rH  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )XLNetConverterc                 C   s   dd |j D S )Nc                 S   s2   g | ]*}t |jr|j|jfn|j|jd  fqS r   r   r   r   r   r   r5   '  s   z(XLNetConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r.   &  s    zXLNetConverter.vocabc                 C   s   t ddt ddg}| jjs<|t   |t   | jjrR|t   |j	j
}|rn|t | |t tdd t |S r   r   r   r   r   r   rk   ,  s    

zXLNetConverter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r_   r   rR   r   r   r   rs   ?  s    zXLNetConverter.post_processorNr   r   r   r   r   rI  %  s   rI  c                   @   s   e Zd ZdS )ReformerConverterNrB   rC   rD   r   r   r   r   rJ  J  s   rJ  c                   @   s   e Zd Zdd Zdd ZdS )RemBertConverterc                 C   s   t ddt ddt tddg}| jjsJ|t   |t   | jjr`|t 	  |j
j}|r||t | t |S r   )r
   r   r   rP   r   r:   r   r   ri   r   r   r   r   r   r   r   r   r   rk   P  s    

zRemBertConverter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS r   r   rR   r   r   r   rs   c  s    zRemBertConverter.post_processorN)rB   rC   rD   rk   rs   r   r   r   r   rL  N  s   rL  c                   @   s   e Zd ZdS )BertGenerationConverterNrK  r   r   r   r   rM  n  s   rM  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd k	r6|| j jdfg7 }| j jd k	rd| j j| j jk rd|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )Nr   c                 S   s   g | ]}d | ddfqS )z<unk_>g      Yr   r*   r   r   r   r   r5     s     z*PegasusConverter.vocab.<locals>.<listcomp>r2   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5     s     )	rP   Z	pad_token	eos_tokenZmask_token_sentZ
mask_tokenZmask_token_idoffsetr6   r   r   r   r   r   r.   s  s    


zPegasusConverter.vocabc                 C   s   |j j| jj S rO   )r   r   rP   rR  r   r   r   r   r     s    zPegasusConverter.unk_idc                 C   s   t t  t j||dgS r   )r   r   ZWhitespaceSplitr   r   r   r   r   rm     s
    zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br_   )rP   rQ  eos_token_idr   rr   )r&   eosrb   r   r   r   rs     s    
zPegasusConverter.post_processorN)rB   rC   rD   r.   r   rm   rs   r   r   r   r   rN  r  s   rN  c                   @   s   e Zd Zdd Zdd ZdS )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5     s     z%T5Converter.vocab.<locals>.<listcomp>c                 S   s   g | ]}d | ddfqS )z
<extra_id_rO  r   r   rP  r   r   r   r5     s     r   rI   )rP   Z
_extra_idsr   r6   )r&   r   Znum_extra_idsr.   r   r   r   r.     s    zT5Converter.vocabc                 C   s*   t jddgddddgd| jdfgdS )NrS  r   rT  r_   r   rR   r   r   r   rs     s    
zT5Converter.post_processorN)rB   rC   rD   r.   rs   r   r   r   r   rW    s   rW  c                   @   s   e Zd ZedddZdS )WhisperConverterr(   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fft||d|_|S )Nr   Fr   r   r}   c                 S   s   g | ]}| d qS )r\   r   )r*   tokenr   r   r   r5     s     z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r\   z $A:0 $B:1 r^   r_   )rP   r   r   r   r   r   r   r   r   r   rm   r	   rt   Zprefix_tokensZconvert_ids_to_tokensrQ  rU  joinr   rr   ziprs   )	r&   r.   r>   rv   Zprefix_token_idsprefixesrV  rU  Zprefix_templater   r   r   rS     s8    
	zWhisperConverter.convertedNry   r   r   r   r   rX    s   rX  c                   @   s   e Zd Zdd ZdS )BigBirdConverterc                 C   s,   t jddd| jdfd| jdfgdS r   r   rR   r   r   r   rs     s    zBigBirdConverter.post_processorN)rB   rC   rD   rs   r   r   r   r   r]    s   r]  c                   @   s   e Zd ZedddZdS )CLIPConverterr(   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr.   r>   r   r   r   r   rV   z\s+r}   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedT)r   invertr   r   )rP   r   r   r   r   rV   r   r   rF   r
   r   NFCr   r   r   rk   r   ZSplitr   rm   r	   rt   r   r   rQ  rU  r   r   rs   r   r   r   r   rS     sD    


zCLIPConverter.convertedNry   r   r   r   r   r^    s   r^  c                   @   s   e Zd ZedddZdS )LayoutLMv2Converterr(   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )NrU   FTrW   rX   r[   r\   r]   r^   r_   rc   rd   rf   ru   r   r   r   rS     s:    



zLayoutLMv2Converter.convertedNry   r   r   r   r   rc    s   rc  c                   @   s   e Zd ZedddZdS )BlenderbotConverterr(   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 r\   )r`   rb   )rP   r   r   r   r   r   r   r   r   r   rm   r	   rt   r   rr   rQ  rU  rs   r   r   r   r   rS   -  s*    

zBlenderbotConverter.convertedNry   r   r   r   r   rd  ,  s   rd  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )XGLMConverterc                 C   sB   ddddg}|dd |j dd  D 7 }|dd	d
ddddg7 }|S )Nr   r   r   r   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5   Q  s     z'XGLMConverter.vocab.<locals>.<listcomp>r   )z<madeupword0>r   )z<madeupword1>r   )z<madeupword2>r   )z<madeupword3>r   )z<madeupword4>r   )z<madeupword5>r   )z<madeupword6>r   r   r   r   r   r   r.   J  s    zXGLMConverter.vocabc                 C   s   d}|S r   r   r   r   r   r   r   W  s    zXGLMConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nz</s> $Az</s> $A </s> </s> $Br   r   r_   r   rR   r   r   r   rs   [  s    zXGLMConverter.post_processorNr   r   r   r   r   re  I  s   re  c                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )LlamaConverterTc                 C   s*   dddg}|dd |j dd  D 7 }|S )Nr   r   r   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r5   o  s     z(LlamaConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r   r.   i  s    zLlamaConverter.vocabc                 C   s   d}|S )Nr   r   r   r   r   r   r   r  s    zLlamaConverter.unk_idc              	   C   s,   t t ddt  t  t jdddgS )Nr   r}   r   )contentleft)r	   r   r   ZByteFallbackZFuser   r   r   r   r   rt   v  s    
zLlamaConverter.decoderc           	      C   s   |j j}| |}|dkr\dd l}t|jtdk rHtt|d}qtt|ddd}np|dkrt	| j
j|\}}dd t|D }tt|||j jddd	}|td
tdtdg ntd|S )Nr   r   z0.14.0T)r   r2   c                 S   s   i | ]\}\}}||qS r   r   )r*   r   r   Z_scorer   r   r   r,     s    
  z,LlamaConverter.tokenizer.<locals>.<dictcomp>)rV   r   r   r   r   r   r   )r   r   r.   
tokenizersr   r   r   r   r   r    rP   r   rA   r   r   r   r   r   r   )	r&   r   r   r=   ri  rv   r   r>   r   r   r   r   rv     s.    
zLlamaConverter.tokenizerc                 C   s    t t jddt jdddgS )Nr   )prependr}   )patternrg  )r
   r   ZPrependr   r   r   r   r   rk     s
    
zLlamaConverter.normalizerc                 C   s   d S rO   r   r   r   r   r   rm     s    zLlamaConverter.pre_tokenizerc           
      C   s   | j j}| j j}|s|r| j j}| j j}| j j}| j j}|d |  d|rVd| d nd }| d| d |  d|rd| d nd }g }	|r|	||f |r|	||f tj	|||	dS d S d S )	Nr   z$A:0r}   r\   r   r^   z $B:1r_   )
rP   r   Zadd_eos_tokenr   r   rQ  rU  r:   r   rr   )
r&   Zadd_bosZadd_eosr   r   rV  rU  r`   ra   rb   r   r   r   rs     s     &.zLlamaConverter.post_processorN)rB   rC   rD   r   r.   r   rt   rv   rk   rm   rs   r   r   r   r   rf  f  s   	
rf  c                   @   s   e Zd ZedddZdS )MarkupLMConverterr(   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr_  r   z $A z $B r_   )rP   r   r   r   r   r   r   rV   r   r   r   rm   r	   rt   rF   rn   ro   rp   rq   r   rr   rs   )	r&   r   r.   r>   rv   rw   rx   rp   rq   r   r   r   rS     s8    
	zMarkupLMConverter.convertedNry   r   r   r   r   rl    s   rl  )4ZAlbertTokenizerZBartTokenizerZBarthezTokenizerZBertTokenizerZBigBirdTokenizerZBlenderbotTokenizerZCamembertTokenizerZCLIPTokenizerZCodeGenTokenizerZConvBertTokenizerZDebertaTokenizerZDebertaV2TokenizerZDistilBertTokenizerZDPRReaderTokenizerZDPRQuestionEncoderTokenizerZDPRContextEncoderTokenizerZElectraTokenizerZFNetTokenizerZFunnelTokenizerZGPT2TokenizerZHerbertTokenizerZLayoutLMTokenizerZLayoutLMv2TokenizerZLayoutLMv3TokenizerZLayoutXLMTokenizerZLongformerTokenizerZLEDTokenizerZLxmertTokenizerZMarkupLMTokenizerZMBartTokenizerZMBart50TokenizerZMPNetTokenizerZMobileBertTokenizerZMvpTokenizerZNllbTokenizerZOpenAIGPTTokenizerZPegasusTokenizerZRealmTokenizerZReformerTokenizerZRemBertTokenizerZRetriBertTokenizerZRobertaTokenizerZRoFormerTokenizerZSqueezeBertTokenizerZT5TokenizerZWhisperTokenizerZXLMRobertaTokenizerZXLNetTokenizerZSplinterTokenizerZXGLMTokenizerZLlamaTokenizerZCodeLlamaTokenizerr(   c                 C   s@   | j j}|tkr,td| dtt  t| }||  S )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zAn instance of tokenizer class zv cannot be converted in a Fast tokenizer instance. No converter was found. Currently available slow->fast convertors: )r   rB   SLOW_TO_FAST_CONVERTERS
ValueErrorr   r   rS   )Ztransformer_tokenizerZtokenizer_class_nameZconverter_classr   r   r   convert_slow_tokenizer%  s    ro  )r   )BrE   r   typingr   r   r   	packagingr   ri  r   r   r   r	   r
   r   r   Ztokenizers.modelsr   r   r   utilsr   r   Zutils.import_utilsr   r   r    rF   boolrM   rN   rT   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r-  rF  rH  rI  rJ  rL  rM  rN  rW  rX  r]  r^  rc  rd  re  rf  rl  rm  ro  r   r   r   r   <module>   s   $
('2''#'^% 5% '$+'a'8