U
    ,-e$R                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
ZddlmZmZmZmZmZ ddlmZ eeZdd	d
Zddiddid
ZddiZdi iZe dd Zdd Zdd Zdd ZG dd deZ G dd deZ!dS )zTokenization classes for CLIP.    N)	lru_cache)ListOptionalTuple   )
AddedTokenPreTrainedTokenizer_is_control_is_punctuation_is_whitespace)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_filezopenai/clip-vit-base-patch32zKhttps://huggingface.co/openai/clip-vit-base-patch32/resolve/main/vocab.jsonzKhttps://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txtM   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ],}|| krf| | |d
|  |d7 }qfdd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS  )chr).0nr   r   k/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/clip/tokenization_clip.py
<listcomp>L   s     z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode7   s    L

r'   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairsZ	prev_charcharr   r   r   	get_pairsP   s    r-   c                 C   s   t dd| } |  } | S )Nz\s+ )resubstrip)textr   r   r   whitespace_clean^   s    r3   c                 C   s   |   } | sg S |  }|S )z@Runs basic whitespace cleaning and splitting on a piece of text.)r1   split)r2   tokensr   r   r   whitespace_tokenizee   s
    r6   c                   @   sN   e Zd ZdZdddZdddZdd	 Zdd
dZdd Zdd Z	dd Z
dS )BasicTokenizera  
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    TNc                 C   s2   |d krg }|| _ t|| _|| _|| _|| _d S N)do_lower_caser(   never_splittokenize_chinese_charsstrip_accentsdo_split_on_punc)selfr9   r:   r;   r<   r=   r   r   r   __init__   s    
zBasicTokenizer.__init__c                 C   s   |r| j t|n| j }| |}| jr4| |}td|}t|}g }|D ]R}||kr| j	r|
 }| jdk	r| |}n| jr| |}|| || qPtd|}|S )aj  
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        NFCFr.   )r:   unionr(   _clean_textr;   _tokenize_chinese_charsunicodedata	normalizer6   r9   lowerr<   _run_strip_accentsextend_run_split_on_puncjoin)r>   r2   r:   Zunicode_normalized_textZorig_tokensZsplit_tokenstokenZoutput_tokensr   r   r   tokenize   s$    




zBasicTokenizer.tokenizec                 C   sB   t d|}g }|D ]"}t |}|dkr,q|| qd|S )z$Strips accents from a piece of text.ZNFDZMn )rD   rE   categoryr!   rJ   )r>   r2   outputr,   catr   r   r   rG      s    
z!BasicTokenizer._run_strip_accentsc                 C   s   | j r|dk	r||kr|gS t|}d}d}g }|t|k r|| }t|r^||g d}n |rl|g  d}|d | |d7 }q0dd |D S )	z&Splits punctuation on a piece of text.Nr   TFr   c                 S   s   g | ]}d  |qS )rM   )rJ   )r   xr   r   r   r      s     z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)r=   r   lenr
   r!   )r>   r2   r:   charsiZstart_new_wordrO   r,   r   r   r   rI      s"    

z!BasicTokenizer._run_split_on_puncc                 C   sT   g }|D ]@}t |}| |r>|d || |d q|| qd|S )z)Adds whitespace around any CJK character.r.   rM   )r    _is_chinese_charr!   rJ   r>   r2   rO   r,   cpr   r   r   rC      s    


z&BasicTokenizer._tokenize_chinese_charsc                 C   s   |dkr|dks|dkr |dks|dkr0|dks|dkr@|dks|d	krP|d
ks|dkr`|dks|dkrp|dks|dkr|dkrdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )r>   rX   r   r   r   rV      sD    
zBasicTokenizer._is_chinese_charc                 C   sX   g }|D ]D}t |}|dks|dkst|r.qt|rB|d q|| qd|S )zBPerforms invalid character removal and whitespace cleanup on text.r   i  r.   rM   )r    r	   r   r!   rJ   rW   r   r   r   rB     s    zBasicTokenizer._clean_text)TNTNT)N)N)__name__
__module____qualname____doc__r?   rL   rG   rI   rC   rV   rB   r   r   r   r   r7   o   s        

&
r7   c                       s   e Zd ZdZeZeZeZ	ddgZ
d$ fdd	Zed	d
 Zdd Zd%ee eee  ee dddZd&ee eee  eee d fddZd'ee eee  ee dddZdd Zdd Zdd Zdd Zdd  Zd(eee ee d!d"d#Z  ZS ))CLIPTokenizera<  
    Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
    Z	input_idsZattention_maskreplace<|endoftext|><|startoftext|>c              	      s  t |trt|dddn|}t |tr4t|dddn|}t |trPt|dddn|}zdd l}	|	j| _W n2 tk
r   td tddd| _	d | _Y nX t
|dd}
t|
| _W 5 Q R X dd	 | j D | _|| _t | _d
d	 | j D | _t
|dd }|  ddd }W 5 Q R X dd |D }tt|tt|| _ddd| _tdtj| _ t! j"f |||||d| d S )NF)lstriprstripr   zKftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.)r<   r=   utf-8encodingc                 S   s   i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>F  s      z*CLIPTokenizer.__init__.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r   r   rf   r   r   r   ri   I  s      
r   i  c                 S   s   g | ]}t | qS r   )tupler4   )r   merger   r   r   r   L  s     z*CLIPTokenizer.__init__.<locals>.<listcomp>r`   r_   )r`   r_   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)errors	unk_token	bos_token	eos_token	pad_token)#
isinstancestrr   ftfyfix_textImportErrorloggerinfor7   nlpopenjsonloadencoderitemsdecoderrm   r'   byte_encoderbyte_decoderreadr1   r4   r"   r#   r   rS   	bpe_rankscacher/   compile
IGNORECASEpatsuperr?   )r>   r   r   rm   rn   ro   rp   rq   kwargsrt   Zvocab_handleZmerges_handleZ
bpe_merges	__class__r   r   r?   -  sD    
$zCLIPTokenizer.__init__c                 C   s
   t | jS r8   )rS   r}   r>   r   r   r   
vocab_size^  s    zCLIPTokenizer.vocab_sizec                 C   s   t | jf| jS r8   )r"   r}   Zadded_tokens_encoderr   r   r   r   	get_vocabb  s    zCLIPTokenizer.get_vocabN)token_ids_0token_ids_1returnc                 C   s<   | j g}| jg}|dkr$|| | S || | | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CLIP sequence has the following format:

        - single sequence: `<|startoftext|> X <|endoftext|>`

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)bos_token_ideos_token_idr>   r   r   ro   rp   r   r   r    build_inputs_with_special_tokense  s
    z.CLIPTokenizer.build_inputs_with_special_tokensF)r   r   already_has_special_tokensr   c                    sl   |rt  j||ddS |dkr8dgdgt|  dg S dgdgt|  dg dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r   r   r   Nr   r   )r   get_special_tokens_maskrS   )r>   r   r   r   r   r   r   r     s      z%CLIPTokenizer.get_special_tokens_maskc                 C   sP   | j g}| jg}|dkr.t|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )r   r   rS   r   r   r   r   $create_token_type_ids_from_sequences  s
    z2CLIPTokenizer.create_token_type_ids_from_sequencesc           
         s~  | j kr j | S t|d d |d d f }t|}|sF|d S t| fddd}| jkrhqf|\}}g }d}|t|k r<z|||}	W n, tk
r   |||d   Y q<Y nX ||||	  |	}|| |kr$|t|d k r$||d  |kr$|	||  |d7 }qx|	||  |d7 }qxt|}|}t|dkr\qfqFt|}qFd	
|}| j |< |S )
NrQ   </w>c                    s    j | tdS )Ninf)r   getfloat)pairr   r   r   <lambda>      z#CLIPTokenizer.bpe.<locals>.<lambda>keyr   r      r.   )r   rk   r-   minr   rS   index
ValueErrorrH   r!   rJ   )
r>   rK   r*   r+   ZbigramfirstsecondZnew_wordrU   jr   r   r   bpe  sB    


2




zCLIPTokenizer.bpec                    s   g } j dkr"d j|}nt  | }t j|D ]B}d fdd|	dD }|
dd  |dD  qB|S )zTokenize a string.Nr.   rM   c                 3   s   | ]} j | V  qd S r8   )r   )r   r&   r   r   r   	<genexpr>  s    z*CLIPTokenizer._tokenize.<locals>.<genexpr>rc   c                 s   s   | ]
}|V  qd S r8   r   )r   Z	bpe_tokenr   r   r   r     s     )ru   rJ   ry   rL   r3   rF   r/   findallr   encoderH   r   r4   )r>   r2   
bpe_tokensrK   r   r   r   	_tokenize  s    
"zCLIPTokenizer._tokenizec                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r}   r   rn   )r>   rK   r   r   r   _convert_token_to_id  s    z"CLIPTokenizer._convert_token_to_idc                 C   s   | j |S )z=Converts an index (integer) in a token (str) using the vocab.)r   r   )r>   r   r   r   r   _convert_id_to_token  s    z"CLIPTokenizer._convert_id_to_tokenc                    s@   d |}t fdd|D }|jd jddd }|S )z:Converts a sequence of tokens (string) in a single string.rM   c                    s   g | ]} j | qS r   )r   )r   cr   r   r   r     s     z:CLIPTokenizer.convert_tokens_to_string.<locals>.<listcomp>rc   )rm   r   r.   )rJ   	bytearraydecoderm   r^   r1   )r>   r5   r2   Z
byte_arrayr   r   r   convert_tokens_to_string  s    
z&CLIPTokenizer.convert_tokens_to_string)save_directoryfilename_prefixr   c           
   	   C   s$  t j|s td| d S t j||r4|d ndtd  }t j||rV|d ndtd  }t|ddd$}|	t
j| jd	d
ddd  W 5 Q R X d}t|dddh}|	d t| j dd dD ]@\}}	||	krtd| |	}|	d|d  |d7 }qW 5 Q R X ||fS )Nz*Vocabulary path ({}) should be a directory-rM   r   r   wrc   rd   r   TF)indent	sort_keysensure_asciirj   r   z#version: 0.2
c                 S   s   | d S )Nr   r   )kvr   r   r   r     r   z/CLIPTokenizer.save_vocabulary.<locals>.<lambda>r   zqSaving vocabulary to {}: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!r.   r   )ospathisdirrw   errorformatrJ   VOCAB_FILES_NAMESrz   writer{   dumpsr}   sortedr   r~   warning)
r>   r   r   r   Z
merge_filefr   writerr   Ztoken_indexr   r   r   save_vocabulary  s6      (
zCLIPTokenizer.save_vocabulary)r^   r_   r`   r_   r_   )N)NF)N)N)rY   rZ   r[   r\   r   Zvocab_files_namesPRETRAINED_VOCAB_FILES_MAPZpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESZmax_model_input_sizesZmodel_input_namesr?   propertyr   r   r   intr   r   boolr   r   r   r   r   r   r   rs   r   r   __classcell__r   r   r   r   r]     sP        1
  
    
   
*r]   )"r\   r{   r   rD   	functoolsr   typingr   r   r   regexr/   Ztokenization_utilsr   r   r	   r
   r   utilsr   Z
get_loggerrY   rw   r   r   r   ZPRETRAINED_INIT_CONFIGURATIONr'   r-   r3   r6   objectr7   r]   r   r   r   r   <module>   sB   
  
  

 "