U
    9%e9                  	   @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmZmZ ejeZejedZeeZe ZW 5 Q R X G dd dZe	e Ze	ee  ZG dd	 d	e
ZG d
d dZG dd dZG dd dZ efe	e! e!dddZ"dS )    N)Template)AnyCallableDictList
NamedTupleOptionalTuple)Encoding	Tokenizerzvisualizer-styles.cssc                   @   s8   e Zd ZU eed< eed< eed< eeedddZdS )
Annotationstartendlabelr   r   r   c                 C   s   || _ || _|| _d S Nr   )selfr   r   r    r   Z/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/tokenizers/tools/visualizer.py__init__   s    zAnnotation.__init__N)__name__
__module____qualname__int__annotations__strr   r   r   r   r   r      s   
r   c                   @   s&   e Zd ZU ee ed< ee ed< dS )CharStateKeytoken_ixanno_ixN)r   r   r   r   r   r   r   r   r   r   r      s   
r   c                   @   sH   e Zd ZU ee ed< dd Zedd Zedd Z	e
dd	d
ZdS )	CharStatechar_ixc                 C   s   || _ d | _g | _d S r   )r    r   tokens)r   r    r   r   r   r   '   s    zCharState.__init__c                 C   s   t | jdkr| jd S d S )Nr   lenr!   r   r   r   r   r   -   s    zCharState.token_ixc                 C   s   t | jdkS )zJ
        BPE tokenizers can output more than one token for a char
           r"   r$   r   r   r   is_multitoken1   s    zCharState.is_multitoken)returnc                 C   s   t | j| jdS )N)r   r   )r   r   r   r$   r   r   r   partition_key8   s    zCharState.partition_keyN)r   r   r   r   r   r   r   propertyr   r&   r   r(   r   r   r   r   r   $   s   


r   c                   @   s   e Zd ZdS )AlignedN)r   r   r   r   r   r   r   r*   ?   s   r*   c                   @   s   e Zd ZdZejdejdZdee	e
eegef  dddZg dfeee
e	 e
e d	d
dZeeeeef dddZeee eedddZeeeeedddZeeeedddZeeeeee dddZdS )EncodingVisualizera  
    Build an EncodingVisualizer

    Args:

         tokenizer (:class:`~tokenizers.Tokenizer`):
            A tokenizer instance

         default_to_notebook (:obj:`bool`):
            Whether to render html output in a notebook by default

         annotation_converter (:obj:`Callable`, `optional`):
            An optional (lambda) function that takes an annotation in any format and returns
            an Annotation object
    z(.{1})?(unk|oov)(.{1})?)flagsTN)	tokenizerdefault_to_notebookannotation_converterc              
   C   sZ   |rDzddl m}m} W n* tk
rB } ztdW 5 d }~X Y nX || _|| _|| _d S )Nr   HTMLdisplayzWe couldn't import IPython utils for html display.
                        Are you running in a notebook?
                        You can also pass `default_to_notebook=False` to get back raw HTML
                    )IPython.core.displayr1   r2   ImportError	Exceptionr-   r.   annotation_coverter)r   r-   r.   r/   r1   r2   er   r   r   r   V   s    zEncodingVisualizer.__init__)textannotationsr.   r'   c           
   
   C   s   | j }|dk	r|}|rVzddlm}m} W n* tk
rT } ztdW 5 d}~X Y nX | jdk	rptt| j|}| j	
|}t|||}	|r|||	 n|	S dS )a  
        Build a visualization of the given text

        Args:
            text (:obj:`str`):
                The text to tokenize

            annotations (:obj:`List[Annotation]`, `optional`):
                An optional list of annotations of the text. The can either be an annotation class
                or anything else if you instantiated the visualizer with a converter function

            default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
                If True, will render the html in a notebook. Otherwise returns an html string.

        Returns:
            The HTML string if default_to_notebook is False, otherwise (default) returns None and
            renders the HTML in the notebook

        Nr   r0   zeWe couldn't import IPython utils for html display.
                    Are you running in a notebook?)r.   r3   r1   r2   r4   r5   r6   listmapr-   encoder+   _EncodingVisualizer__make_html)
r   r8   r9   r.   Zfinal_default_to_notebookr1   r2   r7   encodinghtmlr   r   r   __call__l   s"    
zEncodingVisualizer.__call__)r9   r'   c           	      C   s   t | dkri S ttdd | }t |}td| }|dk rBd}d}d}d}i }t|D ](}d	| d
| d| d||< ||7 }qZ|S )a  
        Generates a color palette for all the labels in a given set of annotations

        Args:
          annotations (:obj:`Annotation`):
            A list of annotations

        Returns:
            :obj:`dict`: A dictionary mapping labels to colors in HSL format
        r   c                 S   s   | j S r   )r   )xr   r   r   <lambda>       z;EncodingVisualizer.calculate_label_colors.<locals>.<lambda>          @   
   zhsl(,z%,%)r#   setr;   r   sorted)	r9   labelsZ
num_labelsZh_stepslhcolorsr   r   r   r   calculate_label_colors   s    
z)EncodingVisualizer.calculate_label_colors)consecutive_chars_listr8   r>   c                 C   s.  | d }|j dkr*|j|j }d| dS | d }|j }|j d }||| }g }	i }
|jdk	r|	d |jrz|	d |jd	 r|	d
 n
|	d tj|j|j dk	r|	d |j|j |
d< n
|	d dd|	 d}d}|
	 D ]\}}|d| d| d7 }qd| d| d| dS )a  
        Converts a list of "consecutive chars" into a single HTML element.
        Chars are consecutive if they fall under the same word, token and annotation.
        The CharState class is a named tuple with a "partition_key" method that makes it easy to
        compare if two chars are consecutive.

        Args:
            consecutive_chars_list (:obj:`List[CharState]`):
                A list of CharStates that have been grouped together

            text (:obj:`str`):
                The original text being processed

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`str`: The HTML span for a set of consecutive chars
        r   Nz(<span class="special-token" data-stoken=z></span>r%   tokenzmulti-token   z	odd-tokenz
even-tokenzspecial-tokenZstokz	non-tokenzclass=" " z data-z="z<span z ></span>)
r    r!   r   appendr&   r+   unk_token_regexsearchjoinitems)rS   r8   r>   firstZstokenlastr   r   Z	span_textZcss_classesZ
data_itemscssdatakeyvalr   r   r   consecutive_chars_to_html   s4    








z,EncodingVisualizer.consecutive_chars_to_html)r8   r>   r9   r'   c                 C   sX  t | ||}|d g}|d j}g }t |}|d j}|d k	rp|| }	|	j}
||
 }|d| d|
 d |dd  D ]}|j}||kr|t j|| |d |g}|d k	r|d |d k	r|| }	|	j}
||
 }|d| d|
 d |}| |d  kr|| q||t j|| |d |g}q||t j|| |d t|}|S )Nr   z&<span class="annotation" style="color:z" data-label="z">r%   )r8   r>   rZ   )	r+   %_EncodingVisualizer__make_char_statesr   rR   r   r[   rf   r(   HTMLBody)r8   r>   r9   char_statesZcurrent_consecutive_charsZprev_anno_ixspansZlabel_colors_dictZcur_anno_ixannor   colorcsresr   r   r   Z__make_html   sb    




zEncodingVisualizer.__make_html)r8   r9   r'   c                 C   s@   dgt |  }t|D ]$\}}t|j|jD ]}|||< q,q|S )a  
        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            A list of  length len(text) whose entry at index i is None if there is no annotation on
            charachter i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations
        N)r#   	enumerateranger   r   )r8   r9   annotation_mapr   air   r   r   Z__make_anno_map<  s
    z"EncodingVisualizer.__make_anno_mapc                 C   s   t | |}dd tt| D }t|jD ]B\}}||}|dk	r,|\}}	t||	D ]}
||
 j| qXq,t|D ]\}}||| _qx|S )a  
        For each character in the original text, we emit a tuple representing it's "state":

            * which token_ix it corresponds to
            * which word_ix it corresponds to
            * which annotation_ix it corresponds to

        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`List[Annotation]`):
                A (possibly empty) list of annotations

            encoding: (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
            it's state is
        c                 S   s   g | ]}t |qS r   )r   ).0r    r   r   r   
<listcomp>j  s     z9EncodingVisualizer.__make_char_states.<locals>.<listcomp>N)	r+   "_EncodingVisualizer__make_anno_maprp   r#   ro   r!   Ztoken_to_charsr[   r   )r8   r>   r9   rq   ri   r   rU   offsetsr   r   rs   r    r   r   r   r   Z__make_char_statesQ  s    
z%EncodingVisualizer.__make_char_states)TN)r   r   r   __doc__recompile
IGNORECASEr\   r   boolr   r   r   r   r   r   AnnotationListr@   staticmethodr   rR   r   r   r
   rf   r=   PartialIntListrv   rg   r   r   r   r   r+   C   s<     -CAr+   )childrenr'   c                 C   s   d | }d| d| dS )a[  
    Generates the full html with css from a list of html spans

    Args:
        children (:obj:`List[str]`):
            A list of strings, assumed to be html elements

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    rY   z?
    <html>
        <head>
            <style>
                zs
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
            z4
            </div>
        </body>
    </html>
    )r^   )r   Z
css_stylesZchildren_textr   r   r   rh   w  s    
	rh   )#	itertoolsosry   stringr   typingr   r   r   r   r   r   r	   Z
tokenizersr
   r   pathdirname__file__r^   Zcss_filenameopenfreadrb   r   r}   r   r   r   r   r*   r+   r   rh   r   r   r   r   <module>   s&   $
  6