U
    9%et                     @   sz  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ d	d
lmZmZm Z  e!e"Z#erddl$m%Z% ddl&m'Z' e rd dl(Z(e rd dl)Z*ddl+m,Z, dZ-e r&d dl.Z.d dl/m-Z- ddl0m1Z1 ej2ej2e3e3ej2e
dddZ4dddZ5G dd deZ6eeG dd de Z7dS )    N)Iterable)TYPE_CHECKINGDictListOptionalTupleUnion   )SquadExampleSquadFeatures"squad_convert_examples_to_features)	ModelCard)PreTrainedTokenizer)PaddingStrategyadd_end_docstringsis_tf_availableis_tokenizers_availableis_torch_availablelogging   )PIPELINE_INIT_ARGSArgumentHandlerChunkPipeline)TFPreTrainedModel)PreTrainedModel)-TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES)Dataset)*MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES)startendtopkmax_answer_lenundesired_tokensreturnc                 C   s  | j dkr| d } |j dkr$|d }tt| dt|d}tt||d }| }|dkrtt|g}nDt||k rt	| }n*t
| |d| }	|	t	||	   }t||jdd \}
}t|
| t|| @ }|
| }
|| }|d|
|f }|
||fS )aG  
    Take the output of any `ModelForQuestionAnswering` and will generate probabilities for each span to be the actual
    answer.

    In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
    answer end position being before the starting position. The method supports output the k-best answer through the
    topk argument.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
    r   Nr   )ndimnpmatmulexpand_dimsZtrilZtriuflattenZargmaxlenZargsortZargpartitionZunravel_indexshapeisinnonzero)r   r   r    r!   r"   outer
candidatesZscores_flatZidx_sortidxstartsendsZdesired_spansscores r4   h/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/pipelines/question_answering.pydecode_spans.   s&    

 r6   @B F   c                 C   s   t t |d }|dk	r$||@ }|dk}	t |	d| } t |	d|}t | | jddd } | |   } t ||jddd }||  }|rt|| d |d   }d | d< |d< t	| ||||\}
}}|
|||fS )	ai  
    Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses
    `decode_spans()` to generate probabilities for each span to be the actual answer.

    Args:
        start (`np.ndarray`): Individual start logits for each token.
        end (`np.ndarray`): Individual end logits for each token.
        p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer
        attention_mask (`np.ndarray`): The attention mask generated by the tokenizer
        min_null_score(`float`): The minimum null (empty) answer score seen so far.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        handle_impossible_answer(`bool`): Whether to allow null (empty) answers
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
    r   Ng        g     r$   T)ZaxisZkeepdims)r   r   )
r&   absarraywhereexpmaxsumminitemr6   )r   r   p_maskattention_maskmin_null_scoretop_khandle_impossible_answerr!   r"   Zundesired_tokens_maskr1   r2   r3   r4   r4   r5   select_starts_ends`   s    rF   c                   @   s    e Zd ZdZdd Zdd ZdS ) QuestionAnsweringArgumentHandlera&  
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
    internal [`SquadExample`].

    QuestionAnsweringArgumentHandler manages all the possible to create a [`SquadExample`] from the command-line
    supplied arguments.
    c                 C   s   t |tr|S t |trdD ]b}||kr2tdq|| d krPtd| dqt || trt|| dkrtd| dqtjf |S t| dd S )NquestioncontextzFYou need to provide a dictionary with keys {question:..., context:...}`z` cannot be Noner   z` cannot be emptyz2 argument needs to be of type (SquadExample, dict))	
isinstancer
   dictKeyError
ValueErrorstrr*   QuestionAnsweringPipelinecreate_sample)selfr@   kr4   r4   r5   	normalize   s    


z*QuestionAnsweringArgumentHandler.normalizec                    s  |d k	rlt |dkrlt |dkr*|d }n>t |dkr`dd |D thkr`|d |d dg}nt|}nd kr d }nd kr d }nd	 krzd
 krzt d	 trt d
 trڇ fdd d	 D }nt d	 tr:t d
 tr:t  d	 t  d
 krtddd t d	  d
 D }n>t d	 trpt d
 trp d	  d
 dg}ntdntd  td k	rtjtfntjf}t||r|S t|t	r|g}n$t|t
rt|}ntd  t|D ]\}}| |||< q|S )Nr   r   r	   c                 S   s   h | ]}t |qS r4   )type).0elr4   r4   r5   	<setcomp>   s     z<QuestionAnsweringArgumentHandler.__call__.<locals>.<setcomp>rH   XdatarI   rJ   c                    s   g | ]}| d  dqS )rJ   rH   r4   )rW   Qkwargsr4   r5   
<listcomp>   s     z=QuestionAnsweringArgumentHandler.__call__.<locals>.<listcomp>z2Questions and contexts don't have the same lengthsc                 S   s   g | ]\}}||d qS )rH   r4   )rW   r\   Cr4   r4   r5   r_      s     zArguments can't be understoodzUnknown arguments zInvalid arguments )r*   rP   listrL   rO   zipr   typesGeneratorTyperM   r   	enumeraterU   )rS   argsr^   inputsZgenerator_typesir@   r4   r]   r5   __call__   s@    
 
  

z)QuestionAnsweringArgumentHandler.__call__N)__name__
__module____qualname____doc__rU   ri   r4   r4   r4   r5   rG      s   rG   c                	       s   e Zd ZdZdZdZd#ed eee	 ee
 e
d fdd	Zeee
ee
 f ee
ee
 f eeee f d
ddZd$ddZ fddZd%ddZdd Zd&ddZdeeeeeeef dddZe
eeee
ee
ef f d d!d"Z  ZS )'rQ   a  
    Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering
    examples](../task_summary#question-answering) for more information.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="deepset/roberta-base-squad2")
    >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin")
    {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=question-answering).
    zquestion,contextFN )r   r   model	tokenizer	modelcard	frameworktaskc                    sB   t  jf |||||d| t | _| | jdkr8tnt d S )Nro   tf)super__init__rG   _args_parserZcheck_model_typers   r   r   )rS   rp   rq   rr   rs   rt   r^   	__class__r4   r5   rw      s    		z"QuestionAnsweringPipeline.__init__)rI   rJ   r#   c                 C   s4   t | trdd t| |D S td| |dddS dS )aC  
        QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method encapsulate all the
        logic for converting question(s) and context(s) to [`SquadExample`].

        We currently support extractive question answering.

        Arguments:
            question (`str` or `List[str]`): The question(s) asked.
            context (`str` or `List[str]`): The context(s) in which we will look for the answer.

        Returns:
            One or a list of [`SquadExample`]: The corresponding [`SquadExample`] grouping question and context.
        c              	   S   s"   g | ]\}}t d ||d d d qS )N)r
   )rW   qcr4   r4   r5   r_   (  s     z;QuestionAnsweringPipeline.create_sample.<locals>.<listcomp>N)rL   ra   rb   r
   rH   r4   r4   r5   rR     s    
z'QuestionAnsweringPipeline.create_samplec
                 K   s   i }|d k	r||d< |d k	r$||d< |d k	r4||d< |d k	rD||d< i }|d k	rh|d krht dt |}|d k	r|dk rtd| d||d	< |d k	r|dk rtd
| |d k	r||d< |d k	r||d< |	d k	r|	|d< |i |fS )Npadding
doc_stridemax_question_lenmax_seq_lenz/topk parameter is deprecated, use top_k insteadr   z$top_k parameter should be >= 1 (got )rD   z-max_answer_len parameter should be >= 1 (got r!   rE   align_to_words)warningswarnUserWarningrO   )rS   r}   r    rD   r~   r!   r   r   rE   r   r^   Zpreprocess_paramsZpostprocess_paramsr4   r4   r5   _sanitize_parameters,  s6    z.QuestionAnsweringPipeline._sanitize_parametersc                    sJ   | j ||}t|ttfr:t|dkr:t j|d f|S t j|f|S )aX  
        Answer the question(s) given as inputs by using the context(s).

        Args:
            args ([`SquadExample`] or a list of [`SquadExample`]):
                One or several [`SquadExample`] containing the question and context.
            X ([`SquadExample`] or a list of [`SquadExample`], *optional*):
                One or several [`SquadExample`] containing the question and context (will be treated the same way as if
                passed as the first positional argument).
            data ([`SquadExample`] or a list of [`SquadExample`], *optional*):
                One or several [`SquadExample`] containing the question and context (will be treated the same way as if
                passed as the first positional argument).
            question (`str` or `List[str]`):
                One or several question(s) (must be used in conjunction with the `context` argument).
            context (`str` or `List[str]`):
                One or several context(s) associated with the question(s) (must be used in conjunction with the
                `question` argument).
            topk (`int`, *optional*, defaults to 1):
                The number of answers to return (will be chosen by order of likelihood). Note that we return less than
                topk answers if there are not enough options available within the context.
            doc_stride (`int`, *optional*, defaults to 128):
                If the context is too long to fit with the question for the model, it will be split in several chunks
                with some overlap. This argument controls the size of that overlap.
            max_answer_len (`int`, *optional*, defaults to 15):
                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
            max_seq_len (`int`, *optional*, defaults to 384):
                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
                model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
            max_question_len (`int`, *optional*, defaults to 64):
                The maximum length of the question after tokenization. It will be truncated if needed.
            handle_impossible_answer (`bool`, *optional*, defaults to `False`):
                Whether or not we accept impossible as an answer.
            align_to_words (`bool`, *optional*, defaults to `True`):
                Attempts to align the answer to real words. Improves quality on space separated langages. Might hurt on
                non-space-separated languages (like Japanese or Chinese)

        Return:
            A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:

            - **score** (`float`) -- The probability associated to the answer.
            - **start** (`int`) -- The character start index of the answer (in the tokenized version of the input).
            - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input).
            - **answer** (`str`) -- The answer to the question.
        r   r   )rx   rL   ra   tupler*   rv   ri   )rS   rf   r^   examplesry   r4   r5   ri   W  s    0z"QuestionAnsweringPipeline.__call__
do_not_pad@   c                 #   s  t |tr$td |d |d d d d }|d kr:t| jjd}|d krPt|d d}||krntd| d| d| jjst|g| j|||t	j
d	d	d
}n>| jjdk| jr|jn|jr|jn|j|rdnd||ddddd
 t d } fddt|D }g }t|D ]}	 d |	 }
d kr8 d |	 nd }d krR d |	 nd }| jjd k	rtt|
| jjkd }|D ]}d||	 |< q||	 }|t|
||| |	 d i ddddg ddd	d d qt|D ]\}}i }i }| jjddg }|j D ]\}}||kr| jdkr^t|}|jtjkrLt|tj }t!|d||< n:| jdkrt"#|}|jt"j kr|$ }|%d||< n|||< q|t|d k}||d||V  qd S )NrI   rJ   i  r	      z`doc_stride` (z ) is larger than `max_seq_len` (r   F)r   rq   Zmax_seq_lengthr~   Zmax_query_lengthZpadding_strategyZis_trainingZtqdm_enabledrightZonly_secondZ
only_firstT)
textZ	text_pairr}   Z
truncation
max_lengthZstrideZreturn_token_type_idsZreturn_overflowing_tokensZreturn_offsets_mappingZreturn_special_tokens_mask	input_idsc                    s$   g | ]}fd d  |D qS )c                    s   g | ]} r|d kndqS )r   r   r4   )rW   tok)question_firstr4   r5   r_     s     zCQuestionAnsweringPipeline.preprocess.<locals>.<listcomp>.<listcomp>)Zsequence_ids)rW   Zspan_idZencoded_inputsr   r4   r5   r_     s   z8QuestionAnsweringPipeline.preprocess.<locals>.<listcomp>rB   token_type_idsr   )r   rB   r   rA   encoding	cls_indextoken_to_orig_mapZexample_index	unique_idZparagraph_lenZtoken_is_max_contexttokensZstart_positionZend_positionZis_impossibleZqas_idrA   ru   ptr   )exampleis_last)&rL   rM   r
   r?   rq   Zmodel_max_lengthrO   is_fastr   r   Z
MAX_LENGTHpadding_sideZquestion_textcontext_textr*   rangeZcls_token_idr&   r-   r:   appendr   re   model_input_names__dict__itemsrs   ru   ZconstantZdtypeZint64castZint32r(   torchtensorlongZ	unsqueeze)rS   r   r}   r~   r   r   featuresZ	num_spansrA   Zspan_idxZinput_ids_span_idxZattention_mask_span_idxZtoken_type_ids_span_idxZcls_indicesr   Zsubmaskrh   featureZfw_argsZothersr   rT   vr   r   r4   r   r5   
preprocess  s    





z$QuestionAnsweringPipeline.preprocessc                    s    d } fdd| j jD }| jdkr0| jjn| jj}dt|j	 krTd|d< | jf |}t
|tr|d |d |d	 S |d d
 \}}|||d	 S d S )Nr   c                    s   i | ]}| | qS r4   r4   )rW   rT   rg   r4   r5   
<dictcomp>  s      z6QuestionAnsweringPipeline._forward.<locals>.<dictcomp>r   Z	use_cacheFZstart_logitsZ
end_logits)r   r   r   r	   )rq   r   rs   rp   forwardcallinspect	signature
parameterskeysrL   rM   )rS   rg   r   Zmodel_inputsZmodel_forwardoutputr   r   r4   r   r5   _forward  s    
z"QuestionAnsweringPipeline._forwardr   r8   Tc                 C   s  d}g }|D ]}|d }	|d }
|d }|d }| dd d k	rN|d  nd }t|	|
||||||\}}}}| jjst|j}t|||D ]z\}}}|d }|	|
 t||| kd d 
 t||| kd d	 
 d
|j|| || d  d qqt| jjdk}|d }| jjdkrP|d | jjk  }nd}|r^dnd}t|||D ]T\}}}|| }|| }| |||||\}}|	|
 |||j|| d qnq|r|	|dddd t|dd ddd | }t|dkr|d S |S )Nr7   r   r   r   rA   rB   r   r   r$    r   )scorer   r   answerr   r   leftr   rn   c                 S   s   | d S )Nr   r4   )xr4   r4   r5   <lambda>_      z7QuestionAnsweringPipeline.postprocess.<locals>.<lambda>T)keyreverse)getnumpyrF   rq   r   r&   r:   Zchar_to_word_offsetrb   r   r@   r;   joinZ
doc_tokensboolr   Zpad_token_idr>   get_indicesr   sortedr*   )rS   Zmodel_outputsrD   rE   r!   r   rC   Zanswersr   Zstart_end_r   rA   rB   r1   r2   r3   Zchar_to_wordser   r   r   encoffsetsequence_indexstart_index	end_indexr4   r4   r5   postprocess  sj    
       

	z%QuestionAnsweringPipeline.postprocessztokenizers.Encoding)r   r   r   r   r   r#   c           
      C   s   |rtz<| |}| |}|j||dd }|j||dd }	W q tk
rp   |j| d }|j| d }	Y qX n|j| d }|j| d }	||	fS )N)r   r   r   )Ztoken_to_wordZword_to_chars	Exceptionoffsets)
rS   r   r   r   r   r   Z
start_wordZend_wordr   r   r4   r4   r5   r   d  s    

z%QuestionAnsweringPipeline.get_indices)r   r   r   r#   c                 C   s   g }d } } }}t |dD ]~\}	}
| j|
}||  krJ|krxn n*||krZ|}||krn|t|
 }||
g7 }||kr q|t|7 }|t|
d 7 }q"d|td|tt||dS )a  
        When decoding from token probabilities, this method maps token indexes to actual word in the initial context.

        Args:
            text (`str`): The actual context to extract the answer from.
            start (`int`): The answer starting token index.
            end (`int`): The answer end token index.

        Returns:
            Dictionary like `{'answer': str, 'start': int, 'end': int}`
        r   r   r   )r   r   r   )re   splitrq   tokenizer*   r   r=   r?   )rS   r   r   r   wordsZ	token_idxZchar_start_idxZchar_end_idxZ	chars_idxrh   wordtokenr4   r4   r5   span_to_answerv  s$    
z(QuestionAnsweringPipeline.span_to_answer)NNrn   )	NNNNNNNNN)r   Nr   N)r   Fr8   T)rj   rk   rl   rm   Zdefault_input_namesrE   r   r   r   r   rP   rw   staticmethodr   r
   rR   r   ri   r   r   r   intr   r   r   r   r   __classcell__r4   r4   ry   r5   rQ      sX                
+5
u    
V    
rQ   )r7   r   Fr8   )8r   rc   r   collections.abcr   typingr   r   r   r   r   r   r   r&   r[   r
   r   r   rr   r   Ztokenization_utilsr   utilsr   r   r   r   r   r   baser   r   r   Z
get_loggerrj   loggerZmodeling_tf_utilsr   Zmodeling_utilsr   Z
tokenizersZ
tensorflowru   Zmodels.auto.modeling_tf_autor   r   r   Ztorch.utils.dataZmodels.auto.modeling_autor   Zndarrayr   r6   rF   rG   rQ   r4   r4   r4   r5   <module>   sN     
    7    
6J