"""Tokenization classes for FSMT."""

import json
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "src_vocab_file": "vocab-src.json",
    "tgt_vocab_file": "vocab-tgt.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "src_vocab_file": {
        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-src.json"
    },
    "tgt_vocab_file": {
        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-tgt.json"
    },
    "merges_file": {
        "stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/merges.txt"
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"stas/tiny-wmt19-en-de": 1024}

PRETRAINED_INIT_CONFIGURATION = {
    "stas/tiny-wmt19-en-de": {
        "langs": ["en", "de"],
        "model_max_length": 1024,
        "special_tokens_map_file": None,
        "full_tokenizer_file": None,
    }
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    """
    text = text.replace("，", ",")
    text = re.sub(r"。\s*", ". ", text)
    text = text.replace("、", ",")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    text = text.replace("∶", ":")
    text = text.replace("：", ":")
    text = text.replace("？", "?")
    text = text.replace("《", '"')
    text = text.replace("》", '"')
    text = text.replace("）", ")")
    text = text.replace("！", "!")
    text = text.replace("（", "(")
    text = text.replace("；", ";")
    text = text.replace("１", "1")
    text = text.replace("」", '"')
    text = text.replace("「", '"')
    text = text.replace("０", "0")
    text = text.replace("３", "3")
    text = text.replace("２", "2")
    text = text.replace("５", "5")
    text = text.replace("６", "6")
    text = text.replace("９", "9")
    text = text.replace("７", "7")
    text = text.replace("８", "8")
    text = text.replace("４", "4")
    text = re.sub(r"．\s*", ". ", text)
    text = text.replace("～", "~")
    text = text.replace("’", "'")
    text = text.replace("…", "...")
    text = text.replace("━", "-")
    text = text.replace("〈", "<")
    text = text.replace("〉", ">")
    text = text.replace("【", "[")
    text = text.replace("】", "]")
    text = text.replace("％", "%")
    return text


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)


class FSMTTokenizer(PreTrainedTokenizer):
    """
    Construct a FAIRSEQ Transformer tokenizer, based on Byte-Pair Encoding. The tokenization process is the
    following:

    - Moses preprocessing and tokenization.
    - Normalizing all input texts.
    - The argument `special_tokens` and the function `set_special_tokens` can be used to add additional symbols (like
      `"__classify__"`) to a vocabulary.
    - The argument `langs` defines a pair of languages.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        langs (`List[str]`):
            A list of two languages to translate from and to, for instance `["en", "ru"]`.
        src_vocab_file (`str`):
            File containing the vocabulary for the source language.
        tgt_vocab_file (`str`):
            File containing the vocabulary for the target language.
        merges_file (`str`):
            File containing the merges.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        langs=None,
        src_vocab_file=None,
        tgt_vocab_file=None,
        merges_file=None,
        do_lower_case=False,
        unk_token="<unk>",
        bos_token="<s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses

        self.src_vocab_file = src_vocab_file
        self.tgt_vocab_file = tgt_vocab_file
        self.merges_file = merges_file
        self.do_lower_case = do_lower_case

        # caches of sacremoses normalizer/tokenizer/detokenizer instances, keyed by lang
        self.cache_moses_punct_normalizer = {}
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        if langs and len(langs) == 2:
            self.src_lang, self.tgt_lang = langs
        else:
            raise ValueError(
                f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}. "
                "Usually that means that the tokenizer can't find a mapping for the given model path in "
                "PRETRAINED_VOCAB_FILES_MAP and other maps of this tokenizer."
            )

        with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
            self.encoder = json.load(src_vocab_handle)
        with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
            tgt_vocab = json.load(tgt_vocab_handle)
            self.decoder = {v: k for k, v in tgt_vocab.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            langs=langs,
            src_vocab_file=src_vocab_file,
            tgt_vocab_file=tgt_vocab_file,
            merges_file=merges_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            **kwargs,
        )

    def get_vocab(self) -> Dict[str, int]:
        return self.get_src_vocab()

    @property
    def vocab_size(self) -> int:
        return self.src_vocab_size

    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        return self.cache_moses_punct_normalizer[lang].normalize(text)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def moses_pipeline(self, text, lang):
        text = replace_unicode_punct(text)
        text = self.moses_punct_norm(text, lang)
        text = remove_non_printing_char(text)
        return text

    @property
    def src_vocab_size(self):
        return len(self.encoder)

    @property
    def tgt_vocab_size(self):
        return len(self.decoder)

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.decoder, **self.added_tokens_decoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - lang: ISO language code (default = 'en') (string). Languages should be among the languages supported by
              the model. However, we don't enforce it.
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
              (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        # `lang` is currently not explicitly passed during generation, so always use self.src_lang
        lang = self.src_lang

        if self.do_lower_case:
            text = text.lower()

        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # remove BPE end-of-word markers
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        # detokenize
        text = self.moses_detokenize(tokens, self.tgt_lang)
        return text

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A FAIRSEQ Transformer sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return token_ids_0 + sep
        return token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        src_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["src_vocab_file"]
        )
        tgt_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tgt_vocab_file"]
        )
        merges_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(src_vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        with open(tgt_vocab_file, "w", encoding="utf-8") as f:
            tgt_vocab = {v: k for k, v in self.decoder.items()}
            f.write(json.dumps(tgt_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merges_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive. "
                        "Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return src_vocab_file, tgt_vocab_file, merges_file

    def __getstate__(self):
        state = self.__dict__.copy()
        # the sacremoses module handle is not picklable; it is restored in __setstate__
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses