U
    9%e;h                     @   s   d dl Z d dlZd dlmZmZmZmZ d dlZddl	m
Z
 ddlmZmZmZmZ ddlmZmZmZmZ e rd dlZddlmZ e rdd	lmZ G d
d deZG dd deZeedG dd deZeZdS )    N)ListOptionalTupleUnion   )BasicTokenizer)ExplicitEnumadd_end_docstringsis_tf_availableis_torch_available   )PIPELINE_INIT_ARGSArgumentHandlerChunkPipelineDataset)/TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES),MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMESc                   @   s*   e Zd ZdZeeee f dddZdS )"TokenClassificationArgumentHandlerz5
    Handles arguments for token classification.
    inputsc                 K   s   |d k	r4t |ttfr4t|dkr4t|}t|}nDt |trJ|g}d}n.td k	r\t |tsht |tjrp|d fS td|	d}|rt |trt |d tr|g}t||krtd||fS )Nr   r   zAt least one input is required.offset_mappingz;offset_mapping should have the same batch size as the input)

isinstancelisttuplelenstrr   typesGeneratorType
ValueErrorget)selfr   kwargsZ
batch_sizer    r"   j/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/pipelines/token_classification.py__call__   s     "


z+TokenClassificationArgumentHandler.__call__N)__name__
__module____qualname____doc__r   r   r   r$   r"   r"   r"   r#   r      s   r   c                   @   s$   e Zd ZdZdZdZdZdZdZdS )AggregationStrategyzDAll the valid aggregation strategies for TokenClassificationPipelinenonesimplefirstZaveragemaxN)	r%   r&   r'   r(   NONESIMPLEFIRSTAVERAGEMAXr"   r"   r"   r#   r)   3   s   r)   a
  
        ignore_labels (`List[str]`, defaults to `["O"]`):
            A list of labels to ignore.
        grouped_entities (`bool`, *optional*, defaults to `False`):
            DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
            same entity together in the predictions or not.
        stride (`int`, *optional*):
            If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
            model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The
            value of this argument defines the number of overlapping tokens between chunks. In other words, the model
            will shift forward by `tokenizer.model_max_length - stride` tokens each step.
        aggregation_strategy (`str`, *optional*, defaults to `"none"`):
            The strategy to fuse (or not) tokens based on the model prediction.

                - "none" : Will simply not do any aggregation and simply return raw results from the model
                - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
                  I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D",
                  "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as
                  different entities. On word based languages, we might end up splitting words undesirably : Imagine
                  Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity":
                  "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages
                  that support that meaning, which is basically tokens separated by a space). These mitigations will
                  only work on real words, "New york" might still be tagged with two different entities.
                - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
                  end up with different tags. Words will simply use the tag of the first token of the word when there
                  is ambiguity.
                - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
                  cannot end up with different tags. scores will be averaged first across tokens, and then the maximum
                  label is applied.
                - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
                  end up with different tags. Word entity will simply be the token with the maximum score.
    c                	       sl  e Zd ZdZdZe f fdd	Zd'ee ee ee	 ee
eeef   ee dddZeee
e f d	 fd
dZd(ddZdd Ze	jdfddZdd Zeejejee
eeef   eje	e
e dddZe
e e	e
e dddZe
e e	edddZe
e e	e
e dddZe
e edd d!Zeeeef d"d#d$Ze
e e
e dd%d&Z  Z S ))TokenClassificationPipelineuv	  
    Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
    examples](../task_summary#named-entity-recognition) for more information.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
    >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal"
    >>> tokens = token_classifier(sentence)
    >>> tokens
    [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}]

    >>> token = tokens[0]
    >>> # Start and end provide an easy way to highlight words in the original text.
    >>> sentence[token["start"] : token["end"]]
    ' jean-baptiste'

    >>> # Some models use the same idea to do part of speech.
    >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple")
    >>> syntaxer("My name is Sarah and I live in London")
    [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous).

    The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
    	sequencesc                    s<   t  j|| | | jdkr tnt tdd| _|| _d S )NtfF)Zdo_lower_case)	super__init__Zcheck_model_type	frameworkr   r   r   Z_basic_tokenizer_args_parser)r    Zargs_parserargsr!   	__class__r"   r#   r7      s    z$TokenClassificationPipeline.__init__N)grouped_entitiesignore_subwordsaggregation_strategyr   stridec           
      C   sJ  i }|d k	r||d< i }|d k	s(|d k	r|r8|r8t j}n|rH|sHt j}nt j}|d k	rhtd| d |d k	rtd| d |d k	rt|trt |  }|t jt j	t j
hkr| jjstd||d< |d k	r||d< |d k	r@|| jjkrtd|t jkrtd	| d
n(| jjr8dd|d}	|	|d< ntd|i |fS )Nr   zl`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="z"` instead.zk`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="z{Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option to `"simple"` or use a fast tokenizer.r?   ignore_labelszl`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)zI`stride` was provided to process all the text but `aggregation_strategy="z&"`, please select another one instead.T)Zreturn_overflowing_tokenspaddingr@   tokenizer_paramszm`stride` was provided to process all the text but you're using a slow tokenizer. Please use a fast tokenizer.)r)   r0   r/   r.   warningswarnr   r   upperr2   r1   	tokenizeris_fastr   model_max_length)
r    rA   r=   r>   r?   r   r@   preprocess_paramsZpostprocess_paramsrC   r"   r"   r#   _sanitize_parameters   sh    	






z0TokenClassificationPipeline._sanitize_parametersr   c                    s.   | j |f|\}}|r||d< t j|f|S )a  
        Classify each token of the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of texts) for token classification.

        Return:
            A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the
            corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy) with
            the following keys:

            - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
              want to have the exact string in the original sentence, use `start` and `end`.
            - **score** (`float`) -- The corresponding probability for `entity`.
            - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
              *aggregation_strategy* is not `"none"`.
            - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
              token in the sentence.
            - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
            - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
        r   )r9   r6   r$   )r    r   r!   Z_inputsr   r;   r"   r#   r$      s    z$TokenClassificationPipeline.__call__c           	      +   s   | di }| jjr$| jjdkr$dnd}| j|f| j|d| jjd|}| dd  t|d }t|D ]v | jdkr fd	d
| D }n fdd
| D }|d k	r||d<  dkr|nd |d<  |d k|d< |V  qjd S )NrC   r   TF)Zreturn_tensors
truncationZreturn_special_tokens_maskZreturn_offsets_mappingZoverflow_to_sample_mapping	input_idsr5   c                    s"   i | ]\}}|t |  d qS r   )r5   Zexpand_dims.0kvir"   r#   
<dictcomp>  s      z:TokenClassificationPipeline.preprocess.<locals>.<dictcomp>c                    s    i | ]\}}||   d qS rN   )Z	unsqueezerO   rS   r"   r#   rU     s      r   sentencer   is_last)poprG   rI   r8   rH   r   rangeitems)	r    rV   r   rJ   rC   rL   r   
num_chunksmodel_inputsr"   rS   r#   
preprocess   s.    
z&TokenClassificationPipeline.preprocessc                 C   s   | d}| dd }| d}| d}| jdkrF| jf |d }n&| jf |}t|trd|d n|d }|||||d|S )	Nspecial_tokens_maskr   rV   rW   r5   r   logits)r_   r^   r   rV   rW   )rX   r8   modelr   dict)r    r\   r^   r   rV   rW   r_   outputr"   r"   r#   _forward  s     



z$TokenClassificationPipeline._forwardc              	      s"   d krdg g }|D ]}|d d   }|d d }|d d }|d d k	rZ|d d nd }	|d d   }
tj|dd	d
}t|| }||jdd	d
 }| jdkr|  }|	d k	r|	  nd }	| ||||	|
|}| ||} fdd|D }|| qt	|}|dkr| 
|}|S )NOr_   r   rV   rM   r   r^   T)axisZkeepdimsr5   c                    s0   g | ](}| d d kr| dd kr|qS )entityNentity_group)r   rP   rg   rA   r"   r#   
<listcomp>D  s   z;TokenClassificationPipeline.postprocess.<locals>.<listcomp>r   )numpynpr-   expsumr8   gather_pre_entities	aggregateextendr   aggregate_overlapping_entities)r    Zall_outputsr?   rA   Zall_entitiesZmodel_outputsr_   rV   rM   r   r^   ZmaxesZshifted_expscorespre_entitiesr=   entitiesr[   r"   rj   r#   postprocess*  sB    
     


z'TokenClassificationPipeline.postprocessc                 C   s   t |dkr|S t|dd d}g }|d }|D ]}|d |d   krT|d k rn nL|d |d  }|d |d  }||kr|}q||kr|d |d kr|}q0|| |}q0|| |S )Nr   c                 S   s   | d S )Nstartr"   )xr"   r"   r#   <lambda>S      zLTokenClassificationPipeline.aggregate_overlapping_entities.<locals>.<lambda>keyrx   endscore)r   sortedappend)r    rv   Zaggregated_entitiesZprevious_entityrg   Zcurrent_lengthZprevious_lengthr"   r"   r#   rs   P  s"    $

z:TokenClassificationPipeline.aggregate_overlapping_entities)rV   rM   rt   r   r^   r?   returnc                 C   s:  g }t |D ]&\}}	|| r q| jt|| }
|dk	r|| \}}t|tsn| jdkrn| }| }||| }t| jddrt| jjj	ddrt
|
t
|k}n@|tjtjtjhkrtdt |dkod||d |d  k}t|| | jjkr|}
d	}nd}d}d	}|
|	||||d
}|| q|S )zTFuse various numpy arrays into dicts with all the information needed for aggregationNpt
_tokenizerZcontinuing_subword_prefixz?Tokenizer does not support real words, using fallback heuristicr    r   F)wordrt   rx   r~   index
is_subword)	enumeraterG   Zconvert_ids_to_tokensintr   r8   itemgetattrr   r`   r   r)   r0   r1   r2   rD   rE   UserWarningZunk_token_idr   )r    rV   rM   rt   r   r^   r?   ru   idxZtoken_scoresr   Z	start_indZend_indZword_refr   
pre_entityr"   r"   r#   rp   d  sV    



   z/TokenClassificationPipeline.gather_pre_entities)ru   r?   r   c                 C   s   |t jt jhkrng }|D ]R}|d  }|d | }| jjj| ||d |d |d |d d}|| qn| ||}|t jkr|S | 	|S )Nrt   r   r   rx   r~   )rg   r   r   r   rx   r~   )
r)   r.   r/   argmaxr`   configid2labelr   aggregate_wordsgroup_entities)r    ru   r?   rv   r   
entity_idxr   rg   r"   r"   r#   rq     s"    
z%TokenClassificationPipeline.aggregate)rv   r?   r   c                 C   s  | j dd |D }|tjkrL|d d }| }|| }| jjj| }n|tjkrt	|dd d}|d }| }|| }| jjj| }nT|tj
krtdd |D }tj|dd	}	|	 }
| jjj|
 }|	|
 }ntd
||||d d |d d d}|S )Nc                 S   s   g | ]}|d  qS r   r"   ri   r"   r"   r#   rk     s     z>TokenClassificationPipeline.aggregate_word.<locals>.<listcomp>r   rt   c                 S   s   | d   S )Nrt   )r-   )rg   r"   r"   r#   rz     r{   z<TokenClassificationPipeline.aggregate_word.<locals>.<lambda>r|   c                 S   s   g | ]}|d  qS )rt   r"   ri   r"   r"   r#   rk     s     )rf   zInvalid aggregation_strategyrx   re   r~   )rg   r   r   rx   r~   )rG   convert_tokens_to_stringr)   r0   r   r`   r   r   r2   r-   r1   rm   stacknanmeanr   )r    rv   r?   r   rt   r   r   rg   Z
max_entityZaverage_scoresr   Z
new_entityr"   r"   r#   aggregate_word  s4    





z*TokenClassificationPipeline.aggregate_wordc                 C   s   |t jt jhkrtdg }d}|D ]@}|dkr8|g}q$|d rL|| q$|| || |g}q$|dk	r|| || |S )z
        Override tokens from a given word that disagree to force agreement on word boundaries.

        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
        company| B-ENT I-ENT
        z;NONE and SIMPLE strategies are invalid for word aggregationNr   )r)   r.   r/   r   r   r   )r    rv   r?   Zword_entitiesZ
word_grouprg   r"   r"   r#   r     s"    z+TokenClassificationPipeline.aggregate_words)rv   r   c                 C   sj   |d d  dd }tdd |D }dd |D }|t|| j||d d |d d	 d
}|S )z
        Group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        r   rg   -re   c                 S   s   g | ]}|d  qS )r   r"   ri   r"   r"   r#   rk     s     zBTokenClassificationPipeline.group_sub_entities.<locals>.<listcomp>c                 S   s   g | ]}|d  qS r   r"   ri   r"   r"   r#   rk     s     rx   r~   )rh   r   r   rx   r~   )splitrm   r   ZmeanrG   r   )r    rv   rg   rt   tokensrh   r"   r"   r#   group_sub_entities  s    


z.TokenClassificationPipeline.group_sub_entities)entity_namer   c                 C   sH   | drd}|dd  }n$| dr8d}|dd  }nd}|}||fS )NzB-Br   zI-I)
startswith)r    r   bitagr"   r"   r#   get_tag  s    

z#TokenClassificationPipeline.get_tagc           	      C   s   g }g }|D ]n}|s | | q| |d \}}| |d d \}}||krd|dkrd| | q| | | |g}q|r| | | |S )z
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        rg   re   r   )r   r   r   )	r    rv   Zentity_groupsZentity_group_disaggrg   r   r   Zlast_biZlast_tagr"   r"   r#   r     s    
z*TokenClassificationPipeline.group_entities)NNNNNN)N)!r%   r&   r'   r(   Zdefault_input_namesr   r7   r   boolr)   r   r   r   rK   r   r   r$   r]   rc   r.   rw   rs   rm   Zndarrayra   rp   rq   r   r   r   r   r   __classcell__r"   r"   r;   r#   r3   =   sF   $$      I 
&>r3   )r   rD   typingr   r   r   r   rl   rm   Zmodels.bert.tokenization_bertr   utilsr   r	   r
   r   baser   r   r   r   Z
tensorflowr5   Zmodels.auto.modeling_tf_autor   Zmodels.auto.modeling_autor   r   r)   r3   ZNerPipeliner"   r"   r"   r#   <module>   s,   
#   ^