"""Tokenization classes for BioGPT."""
import json
import os
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/vocab.json",
    },
    "merges_file": {
        "microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "microsoft/biogpt": 1024,
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class BioGptTokenizer(PreTrainedTokenizer):
    """
    Construct a FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
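
    Example of basic usage (a minimal sketch rather than an exhaustive reference; it assumes the
    `microsoft/biogpt` checkpoint referenced in the pretrained map above is available, and outputs are
    omitted because they depend on the downloaded vocabulary files):

    ```python
    >>> from transformers import BioGptTokenizer

    >>> tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    >>> tokens = tokenizer.tokenize("Induction of apoptosis was observed in the treated cells.")
    >>> input_ids = tokenizer.convert_tokens_to_ids(tokens)
    >>> text = tokenizer.convert_tokens_to_string(tokens)
    ```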

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use BioGptTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.lang = "en"
        self.sm = sacremoses

        # caches of sacremoses MosesTokenizer / MosesDetokenizer instances, keyed by language
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs
        )

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text, bypass_tokenizer=False):
        """Returns a tokenized string."""
        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_tokenize(text, self.lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # remove the BPE end-of-word markers and re-split into words
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        # detokenize with Moses
        text = self.moses_detokenize(tokens, self.lang)
        return text

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A BioGPT sequence has the following format:

        - single sequence: `</s> X `
        - pair of sequences: `</s> A </s> B `
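
        A sketch with placeholder IDs (not entries from a real vocabulary), where `sep` stands for
        `tokenizer.sep_token_id`:

        ```python
        # tokenizer.build_inputs_with_special_tokens([5, 6])          -> [sep, 5, 6]
        # tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])  -> [sep, 5, 6, sep, 7, 8]
        ```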

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.
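
        For example (with placeholder IDs), `token_ids_0 = [5, 6]` and `token_ids_1 = [7, 8]` passed with
        `already_has_special_tokens=False` yield the mask `[1, 0, 0, 1, 0, 0]`, the 1s marking the two
        prepended `</s>` separators.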

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
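
        A sketch with placeholder IDs (not entries from a real vocabulary):

        ```python
        # tokenizer.create_token_type_ids_from_sequences([5, 6])          -> [0, 0, 0]
        # tokenizer.create_token_type_ids_from_sequences([5, 6], [7, 8])  -> [0, 0, 0, 1, 1, 1]
        ```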

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use XLMTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses