"""Tokenization classes for ESM."""
import os
from typing import List, Optional, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/esm2_t6_8M_UR50D": "https://huggingface.co/facebook/esm2_t6_8M_UR50D/resolve/main/vocab.txt",
        "facebook/esm2_t12_35M_UR50D": "https://huggingface.co/facebook/esm2_t12_35M_UR50D/resolve/main/vocab.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/esm2_t6_8M_UR50D": 1024,
    "facebook/esm2_t12_35M_UR50D": 1024,
}

def load_vocab_file(vocab_file):
    with open(vocab_file, "r") as f:
        lines = f.read().splitlines()
        return [l.strip() for l in lines]

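# Illustrative note (not part of the original module): `load_vocab_file`
# expects a plain-text vocabulary with one token per line, the `vocab.txt`
# format referenced in PRETRAINED_VOCAB_FILES_MAP above. For ESM-2 checkpoints
# this file holds special tokens such as "<cls>" and "<pad>" alongside the
# amino-acid alphabet; consult a checkpoint's vocab.txt for the exact ordering.
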
class EsmTokenizer(PreTrainedTokenizer):
    """
    Constructs an ESM tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        **kwargs,
    ):
        self.all_tokens = load_vocab_file(vocab_file)
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            **kwargs,
        )

        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)

    def _convert_id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def _convert_token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def _tokenize(self, text, **kwargs):
        return text.split()
   t | jS r3   )lenr)   )r-   with_added_tokensr   r   r   get_vocab_size^   s    zEsmTokenizer.get_vocab_sizec                 C   s   dd t | jD S )Nc                 S   s   i | ]\}}||qS r   r   )r   ir8   r   r   r   r    b   s      z*EsmTokenizer.get_vocab.<locals>.<dictcomp>)r(   r&   r-   r   r   r   	get_vocaba   s    zEsmTokenizer.get_vocabc                 C   s   | j || j | jS r3   r9   r:   r   r   r   token_to_idd   s    zEsmTokenizer.token_to_idc                 C   s   | j || jS r3   r4   r6   r   r   r   id_to_tokeng   s    zEsmTokenizer.id_to_tokenN)token_ids_0token_ids_1r2   c                 C   s^   | j g}| jg}|d kr8| jd kr*|| S || | S n| jd krJtd|| | | | S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)Zcls_token_idZeos_token_id
ValueError)r-   rG   rH   clssepr   r   r    build_inputs_with_special_tokensj   s    

z-EsmTokenizer.build_inputs_with_special_tokens)rG   rH   already_has_special_tokensr2   c                    sd   |r&|dk	rt d fdd|D S dgdgt|  dg }|dk	r`|dgt| dg 7 }|S )a  
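    # For example (illustrative, with placeholder ids): token_ids_0 = [5, 6, 7]
    # becomes [cls_id, 5, 6, 7, eos_id], and the pair ([5, 6, 7], [8, 9])
    # becomes [cls_id, 5, 6, 7, eos_id, 8, 9, eos_id].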
    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )

            return [1 if token in self.all_special_ids else 0 for token in token_ids_0]
        mask = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1) + [1]
        return mask
    def save_vocabulary(self, save_directory, filename_prefix):
        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.txt")
        with open(vocab_file, "w") as f:
            f.write("\n".join(self.all_tokens))
        return (vocab_file,)
    @property
    def vocab_size(self) -> int:
        return self.get_vocab_size(with_added_tokens=False)
    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        return super()._add_tokens(new_tokens, special_tokens=True)
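
# Usage sketch (illustrative addition, not part of the original module). Run it
# from a separate script so the package's relative imports resolve; it assumes
# the "facebook/esm2_t6_8M_UR50D" checkpoint listed in
# PRETRAINED_VOCAB_FILES_MAP above is reachable via `from_pretrained`:
#
#     from transformers import EsmTokenizer
#
#     tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
#     enc = tokenizer("MKTVRQ")
#     print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
#     # expected: ['<cls>', 'M', 'K', 'T', 'V', 'R', 'Q', '<eos>']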