import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "uclanlp/plbart-base": "https://huggingface.co/uclanlp/plbart-base/resolve/main/sentencepiece.bpe.model",
        "uclanlp/plbart-c-cpp-defect-detection": (
            "https://huggingface.co/uclanlp/plbart-c-cpp-defect-detection/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-cs-java": "https://huggingface.co/uclanlp/plbart-cs-java/resolve/main/sentencepiece.bpe.model",
        "uclanlp/plbart-en_XX-java": (
            "https://huggingface.co/uclanlp/plbart-en_XX-java/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-go-en_XX": "https://huggingface.co/uclanlp/plbart-go-en_XX/resolve/main/sentencepiece.bpe.model",
        "uclanlp/plbart-java-clone-detection": (
            "https://huggingface.co/uclanlp/plbart-java-clone-detection/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-java-cs": "https://huggingface.co/uclanlp/plbart-java-cs/resolve/main/sentencepiece.bpe.model",
        "uclanlp/plbart-java-en_XX": (
            "https://huggingface.co/uclanlp/plbart-java-en_XX/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-javascript-en_XX": (
            "https://huggingface.co/uclanlp/plbart-javascript-en_XX/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-php-en_XX": (
            "https://huggingface.co/uclanlp/plbart-php-en_XX/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-python-en_XX": (
            "https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-refine-java-medium": (
            "https://huggingface.co/uclanlp/plbart-refine-java-medium/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-refine-java-small": (
            "https://huggingface.co/uclanlp/plbart-refine-java-small/resolve/main/sentencepiece.bpe.model"
        ),
        "uclanlp/plbart-ruby-en_XX": (
            "https://huggingface.co/uclanlp/plbart-ruby-en_XX/resolve/main/sentencepiece.bpe.model"
        ),
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "uclanlp/plbart-base": 1024,
    "uclanlp/plbart-c-cpp-defect-detection": 1024,
    "uclanlp/plbart-cs-java": 1024,
    "uclanlp/plbart-en_XX-java": 1024,
    "uclanlp/plbart-go-en_XX": 1024,
    "uclanlp/plbart-java-clone-detection": 1024,
    "uclanlp/plbart-java-cs": 1024,
    "uclanlp/plbart-java-en_XX": 1024,
    "uclanlp/plbart-javascript-en_XX": 1024,
    "uclanlp/plbart-php-en_XX": 1024,
    "uclanlp/plbart-python-en_XX": 1024,
    "uclanlp/plbart-refine-java-medium": 1024,
    "uclanlp/plbart-refine-java-small": 1024,
    "uclanlp/plbart-ruby-en_XX": 1024,
}

FAIRSEQ_LANGUAGE_CODES = {
    "base": ["__java__", "__python__", "__en_XX__"],
    "multi": ["__java__", "__python__", "__en_XX__", "__javascript__", "__php__", "__ruby__", "__go__"],
}

FAIRSEQ_LANGUAGE_CODES_MAP = {
    "java": "__java__",
    "python": "__python__",
    "en_XX": "__en_XX__",
    "javascript": "__javascript__",
    "php": "__php__",
    "ruby": "__ruby__",
    "go": "__go__",
}


class PLBartTokenizer(PreTrainedTokenizer):
    """
    Construct a PLBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.
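
    For example, with `src_lang="java"` a source document is encoded as `tokens </s> __java__`, and with
    `tgt_lang="en_XX"` a target document is encoded as `__en_XX__ tokens </s>`.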

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        src_lang (`str`, *optional*):
            A string representing the source language.
        tgt_lang (`str`, *optional*):
            A string representing the target language.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The start of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The cls token, which is a special token used as the first token for all tasks.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masking tasks. It
            is only used in the `"base"` tokenizer type; for the `"multi"` tokenizer, masking is never done for the
            downstream tasks.
        language_codes (`str`, *optional*, defaults to `"base"`):
            What language codes to use. Should be one of `"base"` or `"multi"`.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that `nbest_size` is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import PLBartTokenizer

    >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
    >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
    >>> expected_translation_english = "Returns the maximum value of a b c."
    >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
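    >>> # Illustrative check: with `text_target` supplied, the target token ids are returned
    >>> # under "labels" alongside the usual encoder inputs.
    >>> sorted(inputs.keys())
    ['attention_mask', 'input_ids', 'labels']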
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        language_codes="base",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. includes the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        src_lang = self._convert_lang_code_special_format(src_lang)
        tgt_lang = self._convert_lang_code_special_format(tgt_lang)

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        self.language_codes = language_codes

        fairseq_language_codes = FAIRSEQ_LANGUAGE_CODES[self.language_codes]

        # The original fairseq vocab and the spm vocab must be "aligned": the first four fairseq
        # positions are the special tokens below, while spm reserves its first positions itself.
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(fairseq_language_codes)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}

        if self.language_codes == "base":
            self.fairseq_tokens_to_ids["<mask>"] = (
                len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
            )

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        _additional_special_tokens = list(self.lang_code_to_id.keys())
        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        if self.language_codes == "base":
            self._src_lang = src_lang
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] if self._src_lang is not None else None
        else:
            self._src_lang = src_lang if src_lang is not None else "__en_XX__"
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            language_codes=language_codes,
            tokenizer_file=tokenizer_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        if self.language_codes == "base":
            # Plus 1 for the mask token
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
        else:
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        new_src_lang = self._convert_lang_code_special_format(new_src_lang)
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
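
        Examples:

        ```python
        >>> # An illustrative sketch (ids are arbitrary): with a source language set, PLBART adds no
        >>> # prefix tokens and two suffix tokens (eos + language code), so three sequence tokens give
        >>> # tokenizer.get_special_tokens_mask([10, 11, 12])  # -> [0, 0, 0, 1, 1]
        ```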
        T)rg   rh   ri   r$   Nr   )rT   get_special_tokens_maskrJ   r   r   )r-   rg   rh   ri   Zprefix_onesZsuffix_onesrX   r&   r.   rj   /  s      z'PLBartTokenizer.get_special_tokens_mask)rg   rh   rd   c                 C   s,   |dkr| j | | j S | j | | | j S )ac  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A PLBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder): `X [eos, src_lang_code]`
        - `decoder_input_ids` (for decoder): `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
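
        Examples:

        ```python
        >>> # An illustrative sketch (ids are arbitrary), assuming src_lang="java": the suffix
        >>> # [eos_token_id, lang_code_to_id["__java__"]] is appended and nothing is prepended, so
        >>> # tokenizer.build_inputs_with_special_tokens([10, 11])  # -> [10, 11, eos_id, java_code_id]
        ```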
        N)r   r   )r-   rg   rh   r&   r&   r.    build_inputs_with_special_tokensM  s    z0PLBartTokenizer.build_inputs_with_special_tokensc                 C   sP   | j g}| jg}|dkr.t|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PLBart does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
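
        Examples:

        ```python
        >>> # An illustrative sketch (ids are arbitrary): only the total length matters, e.g. a pair of
        >>> # sequences of lengths 2 and 1 plus the cls/sep bookkeeping yields seven zeros:
        >>> # tokenizer.create_token_type_ids_from_sequences([10, 11], [12])  # -> [0, 0, 0, 0, 0, 0, 0]
        ```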
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(self.tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "python",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        src_lang = self._convert_lang_code_special_format(src_lang)
        self.cur_lang_code = self.lang_code_to_id[src_lang] if src_lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        lang = self._convert_lang_code_special_format(lang)
        self.cur_lang_code = self.lang_code_to_id[lang] if lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def _convert_lang_code_special_format(self, lang: str) -> str:
        """Convert Language Codes to format tokenizer uses if required"""
        lang = FAIRSEQ_LANGUAGE_CODES_MAP[lang] if lang in FAIRSEQ_LANGUAGE_CODES_MAP.keys() else lang
        return lang
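

# A minimal usage sketch (illustrative): plain language names and fairseq-style dunder codes are
# interchangeable, because FAIRSEQ_LANGUAGE_CODES_MAP resolves "java" -> "__java__" before the
# special tokens are set:
#
#     tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-java-en_XX", src_lang="java")
#     batch = tokenizer.prepare_seq2seq_batch(
#         ["int add(int a, int b){return a+b;}"], src_lang="java", tgt_lang="en_XX", return_tensors="pt"
#     )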