# coding=utf-8
# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for CPMAnt."""
import collections
import os
from typing import List, Optional, Tuple

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/vocab.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openbmb/cpm-ant-10b": 1024,
}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
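# A quick sketch of the mapping `load_vocab` builds (file contents hypothetical):
# a vocab.txt holding the three lines "<pad>", "<unk>", "<s>" yields
# OrderedDict([("<pad>", 0), ("<unk>", 1), ("<s>", 2)]), i.e. token -> line index.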


class WordpieceTokenizer(object):
    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        chars = list(token)
        if len(chars) > self.max_input_chars_per_word:
            return [self.unk_token]

        start = 0
        sub_tokens = []
        while start < len(chars):
            end = len(chars)
            cur_substr = None
            # Greedily match the longest substring of `chars[start:]` found in the vocab.
            while start < end:
                substr = "".join(chars[start:end])
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                # Nothing matched, not even a single character: emit <unk> and advance.
                sub_tokens.append(self.unk_token)
                start += 1
            else:
                sub_tokens.append(cur_substr)
                start = end

        return sub_tokens
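# Illustration (hypothetical vocabulary) of the greedy longest-match-first splitting:
#
#     wp = WordpieceTokenizer(vocab={"天气": 0, "天": 1, "气": 2}, unk_token="<unk>")
#     wp.tokenize("天气")  # -> ["天气"]   (the longest match wins over "天" + "气")
#     wp.tokenize("雨")    # -> ["<unk>"]  (no vocabulary entry covers the character)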


class CpmAntTokenizer(PreTrainedTokenizer):
    """
    Construct a CPMAnt tokenizer. It segments text with jieba and then applies greedy longest-match-first
    WordPiece against the vocabulary.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # Plain spaces and newlines are stored in the vocabulary as the special
        # </_> and </n> entries, so remap them before tokenization.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string: jieba segments the text, WordPiece splits each segment."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)
 d| _t
|dddN}| j	 D ]<\}}||krtd| d |}||d  |d7 }qW 5 Q R X |fS )N-r&   r
   r   r>   r;   r   r:   c                 S   s   | d S r?   r   r@   r   r   r   rB      rC   z1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>rD   wr   r   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r'   )ospathisdirr*   VOCAB_FILES_NAMESrR   r   r   rS   rT   r   loggerwarningwrite)r$   rr   rs   r
   r   writerr   Ztoken_indexr   r   r   save_vocabulary   s0     


zCpmAntTokenizer.save_vocabulary)token_ids_0token_ids_1r`   c                 C   s,   |dkr| j g| S | j g| | j g | S )a1  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.
        - pair of sequences: `[BOS] Sequence_0 [BOS] Sequence_1` (produced when `token_ids_1` is passed).

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence to which special tokens will be added.
            token_ids_1 (`List[int]`, *optional*): The optional second tokenized sequence to which special tokens will be added.

        Returns:
            `List[int]`: The model input with special tokens.
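
        Example (ids hypothetical; assumes `bos_token_id == 6`):

            tokenizer.build_inputs_with_special_tokens([10, 11])        # -> [6, 10, 11]
            tokenizer.build_inputs_with_special_tokens([10, 11], [12])  # -> [6, 10, 11, 6, 12]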
        N)rj   )r$   r   r   r   r   r    build_inputs_with_special_tokens   s    z0CpmAntTokenizer.build_inputs_with_special_tokens)r   r   already_has_special_tokensr`   c                    sZ   |rt  j||ddS |dk	rFdgdgt|  dg dgt|  S dgdgt|  S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`): List of IDs.
            token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
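
        Example (hypothetical): for `token_ids_0 = [10, 11]` and `token_ids_1 = [12]`
        the mask is `[1, 0, 0, 1, 0]`; each `1` marks a prepended `[BOS]` position.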
        T)r   r   r   Nr'   r   )rW   get_special_tokens_maskr)   )r$   r   r   r   rY   r   r   r      s      (z'CpmAntTokenizer.get_special_tokens_mask)	r5   r6   r7   r8   r9   r   r:   r;   r<   )N)N)NF)#r1   r2   r3   __doc__ry   Zvocab_files_namesPRETRAINED_VOCAB_FILES_MAPZpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesZmodel_input_namesZadd_prefix_spacer%   propertyr\   r]   r_   intra   rc   rg   rk   rm   r   strrn   rp   rq   r   r   r~   r   boolr   __classcell__r   r   rY   r   r4   X   sR            *


     
 r4   )r   r   rv   typingr   r   r   Ztransformers.utilsr   r   r=   Ztokenization_utilsr   utilsr	   Z
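
# Example usage (a sketch, not part of the original module: it assumes the jieba
# backend is installed and that the "openbmb/cpm-ant-10b" checkpoint referenced by
# the constants above is reachable):
#
#     from transformers import CpmAntTokenizer
#
#     tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
#     ids = tokenizer("今天天气真好")["input_ids"]  # a [BOS] id is prepended; padding side is "left"
#     text = tokenizer.decode(ids)                 # _decode strips pad/bos/eos ids
#     tokenizer.save_vocabulary(".")               # writes ./vocab.txt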