import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/mbart-large-en-ro": (
            "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model"
        ),
        "facebook/mbart-large-cc25": (
            "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model"
        ),
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/mbart-large-en-ro": 1024,
    "facebook/mbart-large-cc25": 1024,
}

FAIRSEQ_LANGUAGE_CODES = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX",
    "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN",
]


class MBartTokenizer(PreTrainedTokenizer):
    """
    Construct an MBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
        _additional_special_tokens = list(self.lang_code_to_id.keys())

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=None,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1  # Plus 1 for the mask token

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
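
        Illustrative sketch (not a doctest; the token IDs below are placeholders and `tokenizer` is assumed to be
        loaded as in the class-level example). With the default source-language setting the prefix is empty and the
        suffix is `[eos, src_lang_code]`, so the mask ends with two 1s:

        ```python
        >>> tokenizer.get_special_tokens_mask([5, 6, 7])  # doctest: +SKIP
        [0, 0, 0, 1, 1]
        ```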
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
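
        Illustrative sketch (not a doctest; the token IDs below are placeholders and `tokenizer` is assumed to be
        loaded as in the class-level example). In source mode the prefix is empty, so the two suffix IDs are simply
        appended:

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([5, 6, 7])  # doctest: +SKIP
        # -> [5, 6, 7, eos_token_id, <id of the current src_lang code>]
        ```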
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

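        Illustrative sketch (not a doctest; the token IDs below are placeholders and `tokenizer` is assumed to be
        loaded as in the class-level example). The result always has the length of `cls + token_ids_0 + sep`
        (plus `token_ids_1 + sep` for pairs) and contains only zeros:

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([5, 6, 7])  # doctest: +SKIP
        [0, 0, 0, 0, 0]
        ```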
        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[src_lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]