U
    ,È-e*!  ã                8   @   s  d Z ddlZddlZddlmZmZ ddlZddlm	Z	 ddl
mZ e e¡Zddd	œZd
did
did	œZd
diZddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEœ7ZdFdG„ ZG dHdI„ dIe	ƒZdS )Jz)Tokenization classes for Salesforce CTRL.é    N)ÚOptionalÚTupleé   )ÚPreTrainedTokenizer)Úloggingz
vocab.jsonz
merges.txt)Ú
vocab_fileÚmerges_fileZctrlzHhttps://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.jsonzHhttps://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txté   iµ’ iû  i·Ÿ iÐ÷  i»ö  i#j  iñv iµ~ i6²  iÅÁ  ivÌ  iò iØ.  iï iè½  i×š  iÍ¨  i§¯  i%æ  i¦  iø  i3  iR- in  iS.  iK  iñ iwÌ  iÁ´  i[  i*  i¡“  iœì  iÚ/  iè?  iñí in1  i  ip i€  i„ iòÉ iÏ’  i	  i) i-‘ iœ( iºø  i™K iîÕ  iŒ iÇ¢  i   iÄh  i–õ )7Z	PregnancyZChristianityZExplainZFitnessZSavingZAskZAssZJokeZ	QuestionsZThoughtsZRetailZFeminismZWritingZAtheismZNetflixZ	ComputingZOpinionZAloneÚFunnyZGamingZHumanZIndiaZJokerZDietZLegalZNormanZTipZWeightZMoviesZRunningZScienceZHorrorZ
ConfessionZFinanceZPoliticsZScaryZSupportZTechnologiesZTeenageÚEventZLearnedZNotionZ	WikipediaZBooksZExtractZConfessionsZ
ConspiracyZLinksZ	NarcissusZRelationshipZRelationshipsZReviewsZNewsZTranslationZmultilingualc                 C   s>   t ƒ }| d }| dd… D ]}| ||f¡ |}qt |ƒ}|S )z…
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   é   N)ÚsetÚadd)ÚwordÚpairsZ	prev_charÚchar© r   úk/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/ctrl/tokenization_ctrl.pyÚ	get_pairsg   s    r   c                       sŠ   e Zd ZdZeZeZeZ	e
Zd‡ fdd„	Zedd„ ƒZdd„ Zd	d
„ Zdd„ Zdd„ Zdd„ Zdd„ Zdeee ee dœdd„Z‡  ZS )ÚCTRLTokenizera`  
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    ú<unk>c              	      s®   t |dd}t |¡| _W 5 Q R X dd„ | j ¡ D ƒ| _t |dd}| ¡  d¡dd… }W 5 Q R X dd	„ |D ƒ}tt	|t
t|ƒƒƒƒ| _i | _tƒ jf d
|i|—Ž d S )Núutf-8©Úencodingc                 S   s   i | ]\}}||“qS r   r   )Ú.0ÚkÚvr   r   r   Ú
<dictcomp>   s      z*CTRLTokenizer.__init__.<locals>.<dictcomp>Ú
r   éÿÿÿÿc                 S   s   g | ]}t | ¡ ƒ‘qS r   )ÚtupleÚsplit)r   Úmerger   r   r   Ú
<listcomp>“   s     z*CTRLTokenizer.__init__.<locals>.<listcomp>Ú	unk_token)ÚopenÚjsonÚloadÚencoderÚitemsÚdecoderÚreadr!   ÚdictÚzipÚrangeÚlenÚ	bpe_ranksÚcacheÚsuperÚ__init__)Úselfr   r   r$   ÚkwargsZvocab_handleZmerges_handleZmerges©Ú	__class__r   r   r3      s     zCTRLTokenizer.__init__c                 C   s
   t | jƒS ©N)r/   r(   ©r4   r   r   r   Ú
vocab_size˜   s    zCTRLTokenizer.vocab_sizec                 C   s   t | jf| jŽS r8   )r,   r(   Zadded_tokens_encoderr9   r   r   r   Ú	get_vocabœ   s    zCTRLTokenizer.get_vocabc           
         s’  |ˆ j krˆ j | S t|ƒ}tt|d d… ƒ|d d g ƒ}t|ƒ}|sN|S t|‡ fdd„d}|ˆ jkrpqn|\}}g }d}|t|ƒk rDz| ||¡}	W n, tk
rÊ   | 	||d … ¡ Y qDY nX | 	|||	… ¡ |	}|| |kr,|t|ƒd k r,||d  |kr,| 
|| ¡ |d7 }q€| 
|| ¡ |d7 }q€t|ƒ}|}t|ƒdkrdqnqNt|ƒ}qNd	 |¡}|d d
… }|ˆ j |< |S )Nr   z</w>c                    s   ˆ j  | tdƒ¡S )NÚinf)r0   ÚgetÚfloat)Úpairr9   r   r   Ú<lambda>ª   ó    z#CTRLTokenizer.bpe.<locals>.<lambda>©Úkeyr   r   é   ú@@ éüÿÿÿ)r1   r    Úlistr   Úminr0   r/   ÚindexÚ
ValueErrorÚextendÚappendÚjoin)
r4   Útokenr   r   ZbigramÚfirstÚsecondZnew_wordÚiÚjr   r9   r   ÚbpeŸ   sF    

"
2




zCTRLTokenizer.bpec                 C   s8   g }t  d|¡}|D ]}| t|  |¡ d¡ƒ¡ q|S )zTokenize a string.z\S+\n?ú )ÚreÚfindallrK   rG   rS   r!   )r4   ÚtextZsplit_tokensÚwordsrN   r   r   r   Ú	_tokenizeË   s
    zCTRLTokenizer._tokenizec                 C   s   | j  || j  | j¡¡S )z0Converts a token (str) in an id using the vocab.)r(   r=   r$   )r4   rN   r   r   r   Ú_convert_token_to_idÕ   s    z"CTRLTokenizer._convert_token_to_idc                 C   s   | j  || j¡S )z=Converts an index (integer) in a token (str) using the vocab.)r*   r=   r$   )r4   rI   r   r   r   Ú_convert_id_to_tokenÙ   s    z"CTRLTokenizer._convert_id_to_tokenc                 C   s   d  |¡ dd¡ ¡ }|S )z:Converts a sequence of tokens (string) in a single string.rT   rE   Ú )rM   ÚreplaceÚstrip)r4   ÚtokensZ
out_stringr   r   r   Úconvert_tokens_to_stringÝ   s    z&CTRLTokenizer.convert_tokens_to_stringN)Úsave_directoryÚfilename_prefixÚreturnc           
   	   C   s(  t j |¡s"t d|› d¡ d S t j ||r6|d ndtd  ¡}t j ||rX|d ndtd  ¡}t|ddd	$}| t	j
| jd
dddd ¡ W 5 Q R X d}t|ddd	j}| d¡ t| j ¡ dd„ dD ]B\}}	||	krøt d|› d¡ |	}| d |¡d ¡ |d7 }qÒW 5 Q R X ||fS )NzVocabulary path (z) should be a directoryú-r\   r   r   Úwr   r   rD   TF)ÚindentÚ	sort_keysÚensure_asciir   r   z#version: 0.2
c                 S   s   | d S )Nr   r   )Úkvr   r   r   r@   ó   rA   z/CTRLTokenizer.save_vocabulary.<locals>.<lambda>rB   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!rT   r   )ÚosÚpathÚisdirÚloggerÚerrorrM   ÚVOCAB_FILES_NAMESr%   Úwriter&   Údumpsr(   Úsortedr0   r)   Úwarning)
r4   ra   rb   r   Z
merge_fileÚfrI   ÚwriterZ
bpe_tokensZtoken_indexr   r   r   Úsave_vocabularyâ   s2     ÿ ÿ(

ÿzCTRLTokenizer.save_vocabulary)r   )N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ro   Zvocab_files_namesÚPRETRAINED_VOCAB_FILES_MAPZpretrained_vocab_files_mapÚ&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesÚCONTROL_CODESÚcontrol_codesr3   Úpropertyr:   r;   rS   rY   rZ   r[   r`   Ústrr   r   rv   Ú__classcell__r   r   r6   r   r   w   s   
,
r   )rz   r&   rj   Útypingr   r   ÚregexrU   Ztokenization_utilsr   Úutilsr   Z
get_loggerrw   rm   ro   r{   r|   r}   r   r   r   r   r   r   Ú<module>   s”   
þþ ÿÉ;