""" Tokenization classes for BERTweet"""


import html
import os
import re
from shutil import copyfile
from typing import List, Optional, Tuple

import regex

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.txt",
    "merges_file": "bpe.codes",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "vinai/bertweet-base": "https://huggingface.co/vinai/bertweet-base/resolve/main/vocab.txt",
    },
    "merges_file": {
        "vinai/bertweet-base": "https://huggingface.co/vinai/bertweet-base/resolve/main/bpe.codes",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "vinai/bertweet-base": 128,
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
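
    Example (illustrative; `"er</w>"` carries the end-of-word marker that `bpe`
    appends before calling this helper):

    ```python
    >>> sorted(get_pairs(("l", "o", "w", "er</w>")))
    [('l', 'o'), ('o', 'w'), ('w', 'er</w>')]
    ```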
    r      N)setadd)wordpairsZ	prev_charchar r   s/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/bertweet/tokenization_bertweet.py	get_pairs4   s    r   c                       s  e Zd ZdZeZeZeZ	d* fdd		Z
d+ee eee  ee dddZd,ee eee  eee d fddZd-ee eee  ee dddZedd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd.eee ee d%d&d'Zd(d) Z  Z S )/BertweetTokenizera	  
    Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalization (`bool`, *optional*, defaults to `False`):
            Whether or not to apply a normalization (Tweet-normalization) preprocessing step before BPE.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
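
    Example (an illustrative sketch; assumes the `vinai/bertweet-base` checkpoint
    is available locally or from the Hub):

    ```python
    >>> from transformers import BertweetTokenizer

    >>> tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
    >>> tokenizer.tokenize("@remy SC has two cases :-) https://t.co/x")  # doctest: +SKIP
    ```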
    F<s></s><unk><pad><mask>c                    s2  zddl m} || _W n$ tk
r:   td d | _Y nX || _|| _i | _d| j|< d| j|	< d| j|< d| j|< | 	| dd | j
 D | _t|d	d
}| dd d }W 5 Q R X dd |D }tt|tt|| _i | _|| _t | _ddd| _t jf |||||||	|
d| d S )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r      r   c                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>   s      z.BertweetTokenizer.__init__.<locals>.<dictcomp>utf-8encoding
c                 S   s    g | ]}t | d d qS )Nr%   )tuplesplit)r   merger   r   r   
<listcomp>   s     z.BertweetTokenizer.__init__.<locals>.<listcomp>'z...)u   ’u   …)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_token)emojir   	demojizerImportErrorloggerwarningr	   r
   encoderadd_from_fileitemsdecoderopenreadr'   dictziprangelen	bpe_rankscacher+   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr	   r
   r+   r,   r-   r.   r/   r0   r1   r2   kwargsr   Zmerges_handleZmerges	__class__r   r   rH   {   sJ    





 	zBertweetTokenizer.__init__N)token_ids_0token_ids_1returnc                 C   sD   |dkr| j g| | jg S | j g}| jg}|| | | | | S )a  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A BERTweet sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
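
        Example (an illustrative sketch; `tokenizer` is any loaded `BertweetTokenizer` instance):

        ```python
        >>> ids = tokenizer.build_inputs_with_special_tokens([10, 11])
        >>> ids[0] == tokenizer.cls_token_id and ids[-1] == tokenizer.sep_token_id
        True
        ```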
        N)cls_token_idsep_token_id)rI   rM   rN   clssepr   r   r    build_inputs_with_special_tokens   s
    z2BertweetTokenizer.build_inputs_with_special_tokens)rM   rN   already_has_special_tokensrO   c                    sh   |rt  j||ddS |dkr8dgdgt|  dg S dgdgt|  ddg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
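
        Example (an illustrative sketch; `tokenizer` is any loaded `BertweetTokenizer` instance):

        ```python
        >>> tokenizer.get_special_tokens_mask([10, 11])
        [1, 0, 0, 1]
        ```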
        T)rM   rN   rU   Nr   r   )rG   get_special_tokens_maskrA   )rI   rM   rN   rU   rK   r   r   rV      s      z)BertweetTokenizer.get_special_tokens_maskc                 C   sP   | j g}| jg}|dkr.t|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
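
        Example (an illustrative sketch; one zero per position of `<s> X </s>`):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([10, 11])
        [0, 0, 0, 0]
        ```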
        Nr   )rQ   rP   rA   )rI   rM   rN   rS   rR   r   r   r   $create_token_type_ids_from_sequences   s
    z6BertweetTokenizer.create_token_type_ids_from_sequencesc                 C   s
   t | jS N)rA   r8   rI   r   r   r   
vocab_size  s    zBertweetTokenizer.vocab_sizec                 C   s   t | jf| jS rX   )r>   r8   Zadded_tokens_encoderrY   r   r   r   	get_vocab	  s    zBertweetTokenizer.get_vocabc           
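        """
        Apply the Byte-Pair-Encoding merge table to a single token, appending the `</w>`
        end-of-word marker and joining sub-words with `@@ `.

        Example (an illustrative sketch; the actual splits depend entirely on the
        loaded `bpe.codes` file, so the output shown is hypothetical):

        ```python
        >>> tokenizer.bpe("unaffable")  # doctest: +SKIP
        'un@@ aff@@ able'
        ```
        """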
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = "@@ ".join(word)
        word = word[:-4]
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        if self.normalization:  # Perform Tweet normalization before performing BPE
            text = self.normalizeTweet(text)

        split_tokens = []
        words = re.findall(r"\S+\n?", text)
        for token in words:
            split_tokens.extend(list(self.bpe(token).split(" ")))
        return split_tokens

    def normalizeTweet(self, tweet):
        """
        Normalize a raw Tweet
        """
        for punct in self.special_puncts:
            tweet = tweet.replace(punct, self.special_puncts[punct])

        tokens = self.tweetPreprocessor.tokenize(tweet)
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        normTweet = (
            normTweet.replace("cannot ", "can not ")
            .replace("n't ", " n't ")
            .replace("n 't ", " n't ")
            .replace("ca n't", "can't")
            .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
            .replace("'re ", " 're ")
            .replace("'s ", " 's ")
            .replace("'ll ", " 'll ")
            .replace("'d ", " 'd ")
            .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
            .replace(" p . m ", " p.m ")
            .replace(" a . m .", " a.m.")
            .replace(" a . m ", " a.m ")
        )

        return " ".join(normTweet.split())

    def normalizeToken(self, token):
        """
        Normalize tokens in a Tweet
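
        Example (illustrative; user handles collapse to `@USER` and URLs to `HTTPURL`):

        ```python
        >>> tokenizer.normalizeToken("@remy"), tokenizer.normalizeToken("https://t.co")
        ('@USER', 'HTTPURL')
        ```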
        @z@USERhttpZwwwZHTTPURLr   N)lower
startswithrA   rF   r4   )rI   rl   Zlowercased_tokenr   r   r   ry   e  s    




z BertweetTokenizer.normalizeTokenc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r8   r]   r0   )rI   rl   r   r   r   _convert_token_to_idx  s    z&BertweetTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r;   r]   r0   )rI   rg   r   r   r   _convert_id_to_token|  s    z&BertweetTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.rr   rc    )rk   rz   strip)rI   r}   Z

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        out_merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
            copyfile(self.merges_file, out_merge_file)

        return out_vocab_file, out_merge_file

    def add_from_file(self, f):
        """
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        """
        if isinstance(f, str):
            try:
                with open(f, "r", encoding="utf-8") as fd:
                    self.add_from_file(fd)
            except FileNotFoundError as fnfe:
                raise fnfe
            except UnicodeError:
                raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
            return

        lines = f.readlines()
        for lineTmp in lines:
            line = lineTmp.strip()
            idx = line.rfind(" ")
            if idx == -1:
                raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
            word = line[:idx]
            self.encoder[word] = len(self.encoder)


# The Twitter-aware tokenizer below (regex components, `TweetTokenizer`, and its
# helpers) is adapted from NLTK's casual tokenizer.

EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""

URLS = r"""			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )""",
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);strictc                 C   s&   |d krd}t | tr"| ||S | S )Nr!   )r   bytesdecode)rv   r#   errorsr   r   r   _str_to_unicodel  s
    
r   r   Tr!   c                    s     fdd}t |t| |S )u  
    Remove entities from text by converting them to their corresponding unicode character.

    Args:
        text:
            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
        keep (list):
            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
        remove_illegal (bool):
            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
            kept "as is".

    Returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    Examples:

    ```python
    >>> from nltk.tokenize.casual import _replace_html_entities

    >>> _replace_html_entities(b"Price: &pound;100")
    'Price: \xa3100'

    >>> print(_replace_html_entities(b"Price: &pound;100"))
    Price: £100
    ```"""

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 0x80-0x9F range are typically
                # interpreted by browsers as the characters mapped by bytes 80-9F
                # in the Windows-1252 encoding; decode them accordingly.
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = html.entities.name2codepoint.get(entity_body)

        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


class TweetTokenizer:
    r"""
    Examples:

    ```python
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using the *strip_handles* and *reduce_len* parameters:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```"""

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        Args:
            text: str

        Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
        `preserve_case=False`
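
        Example (illustrative; same behavior as the class-level doctest above):

        ```python
        >>> TweetTokenizer(reduce_len=True).tokenize("Waaaaayyyy too much!!!!!!")
        ['Waaayyy', 'too', 'much', '!', '!', '!']
        ```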
        \1\1\1c                 S   s"   g | ]}t |r|n| qS r   )EMOTICON_REsearchr   )r   xr   r   r   r)     s     z+TweetTokenizer.tokenize.<locals>.<listcomp>)
r   r   remove_handlesr   reduce_lengtheningHANG_REr   WORD_REru   r   )rI   rv   Z	safe_textrw   r   r   r   r{     s    	
zTweetTokenizer.tokenizeN)TFF)r   r   r   r   rH   r{   r   r   r   r   rD     s   
rD   c                 C   s   t d}|d| S )za
    Replace repeated character sequences of length 3 or greater with sequences of length 3.
    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.
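
    Example (illustrative; each handle is replaced with a single space):

    ```python
    >>> remove_handles("@remy hi @bob!")
    '  hi  !'
    ```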
    """
    pattern = regex.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
    )
    # Substitute handles with ' ' to ensure that text on either side of removed
    # handles is tokenized correctly.
    return pattern.sub(" ", text)


def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
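
    Example (illustrative; mirrors the `TweetTokenizer` doctest above):

    ```python
    >>> casual_tokenize("@remy: This is waaaaayyyy too much!!!", strip_handles=True, reduce_len=True)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', '!', '!', '!']
    ```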
    r   )rD   r{   )rv   r   r   r   r   r   r   casual_tokenize  s    r   )Nr   )r   Tr!   )TFF))r   r   r   rt   shutilr   typingr   r   r   r   Ztokenization_utilsr   utilsr   Z
get_loggerr   r6   r   r   r   r   r   Z	EMOTICONSZURLSZREGEXPSr   rk   VERBOSEIUNICODEr   r   r   r   r   r   rD   r   r   r   r   r   r   r   <module>   sb   
  
    *.0$



A8