"""Tokenization classes for OpenAI GPT."""

import json
import os
import re
import unicodedata
from typing import Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json"},
    "merges_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt"},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openai-gpt": 512,
}


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
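
    Example (illustrative; the output follows directly from the tokenization rules implemented below):

    ```python
    >>> tokenizer = BasicTokenizer(do_lower_case=True)
    >>> tokenizer.tokenize("Hello, WORLD!")
    ['hello', ',', 'world', '!']
    ```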
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*):
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
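
        Example (illustrative; `[SEP]` is just an arbitrary protected token):

        ```python
        >>> BasicTokenizer().tokenize("Hello [SEP]", never_split=["[SEP]"])
        ['hello', '[SEP]']
        ```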
        """
        # union() returns a new set combining the instance-level and call-level collections.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # Normalizing to NFC prevents treating the same character encoded with different
        # unicode codepoint sequences as different characters.
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode blocks.
        # Note that these blocks do not cover all Japanese and Korean characters:
        # Hangul, Hiragana and Katakana live in other blocks and are written with
        # space-separated words, so they are handled like any other script.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


def get_pairs(word):
    """
    Return the set of symbol pairs in a word. A word is represented as a tuple of symbols (symbols being
    variable-length strings).
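
    Example (illustrative):

    ```python
    >>> sorted(get_pairs(("h", "e", "l", "l", "o</w>")))
    [('e', 'l'), ('h', 'e'), ('l', 'l'), ('l', 'o</w>')]
    ```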
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def text_standardize(text):
    """
    Fixes some issues the SpaCy tokenizer had on the BooksCorpus dataset and performs some whitespace
    standardization.
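
    Example (illustrative):

    ```python
    >>> text_standardize("fixes…  some—issues")
    'fixes... some - issues'
    ```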
    """
    text = text.replace("—", "-")
    text = text.replace("–", "-")
    text = text.replace("―", "-")
    text = text.replace("…", "...")
    text = text.replace("´", "'")
    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r" \1 ", text)
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return text.strip()


class OpenAIGPTTokenizer(PreTrainedTokenizer):
    """
    Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:

    - lowercases all inputs,
    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, falling back to BERT's
      `BasicTokenizer` if not.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
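
    Example (illustrative; `openai-gpt` is the checkpoint referenced by the vocabulary URLs above, and
    `from_pretrained` / `convert_tokens_to_ids` are inherited from [`PreTrainedTokenizer`]):

    ```python
    >>> from transformers import OpenAIGPTTokenizer

    >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    >>> tokens = tokenizer.tokenize("Hello world!")  # lowercased, BPE pieces end in "</w>"
    >>> ids = tokenizer.convert_tokens_to_ids(tokens)
    ```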
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        try:
            import ftfy
            from spacy.lang.en import English

            _nlp = English()
            self.nlp = _nlp.tokenizer
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(unk_token=unk_token, **kwargs)

    @property
    def do_lower_case(self):
        return True

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Merge the lowest-ranked bigram until no known merge remains.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend(list(self.bpe(token).split(" ")))
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an id to a token (BPE) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) to a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file