U
    ,-epP                    @   sN  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ee Z!dZ"eG dd deZ#eG dd deZ$G dd deZ%dZ&dZ'ee&G dd de%Z(ede&G dd de%Z)ede&G dd de%Z*dS ) zRAG model implementation.    N)	dataclass)CallableListOptionalTupleUnion)nn   )PretrainedConfig)BeamSearchScorerGenerationConfigLogitsProcessorListStoppingCriteriaList)ModelOutput)PreTrainedModel)%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	RagConfig)RagRetrieverr   c                   @   s\  e Zd ZU dZdZeej ed< dZ	ejed< dZ
ejed< dZeeej  ed< dZeej ed< dZeej ed< dZeej ed	< dZeej ed
< dZeej ed< dZeeej  ed< dZeeej  ed< dZeej ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dS )RetrievAugLMMarginOutputa  
    Base class for retriever augmented marginalized models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
            each vocabulary token.
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
            (see `past_key_values` input) to speed up sequential decoding.
        retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
            Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
            the `doc_scores`.
        retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
            The indexes of the embedded documents retrieved by the retriever.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever.
        question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
            model.
        question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
        question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
        generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
        generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
        generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    Nlosslogits
doc_scorespast_key_valuesretrieved_doc_embedsretrieved_doc_idscontext_input_idscontext_attention_mask"question_encoder_last_hidden_statequestion_enc_hidden_statesquestion_enc_attentionsgenerator_enc_last_hidden_stategenerator_enc_hidden_statesgenerator_enc_attentionsgenerator_dec_hidden_statesgenerator_dec_attentionsgenerator_cross_attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   r   
LongTensorr   r   r    r!   r   r"   r#   r$   r%   r&   r'   r(    r1   r1   e/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/rag/modeling_rag.pyr   &   s$   
Jr   c                   @   sJ  e Zd ZU dZdZejed< dZejed< dZ	e
eej  ed< dZe
ej ed< dZe
ej ed< dZe
ej ed< dZe
ej ed	< dZe
ej ed
< dZe
eej  ed< dZe
eej  ed< dZe
ej ed< dZe
eej  ed< dZe
eej  ed< dZe
eej  ed< dZe
eej  ed< dZe
eej  ed< dS )RetrievAugLMOutputa;  
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
            each vocabulary token.
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
            (see `past_key_values` input) to speed up sequential decoding.
        retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
            Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
            the `doc_scores`.
        retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
            The indexes of the embedded documents retrieved by the retriever.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever.
        question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
            model.
        question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
        question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
        generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
        generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
        generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
            average in the self-attention heads.
        generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    Nr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   )r)   r*   r+   r,   r   r-   r.   r/   r   r   r   r   r   r   r0   r   r   r    r!   r   r"   r#   r$   r%   r&   r'   r(   r1   r1   r1   r2   r3      s"   
Fr3   c                       sF   e Zd ZdZeZdZe fddZed	e	e	e
edddZ  ZS )
RagPreTrainedModela  
    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
    Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.

    RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
    generator, the encoder and generator are trainable while the retriever is just an indexed dataset.

    ragc                    s   d|d< t  j||S )NFZ
_fast_init)superfrom_pretrained)clsargskwargs	__class__r1   r2   r7      s    z"RagPreTrainedModel.from_pretrainedN).question_encoder_pretrained_model_name_or_path'generator_pretrained_model_name_or_path	retrieverreturnc                 K   sz  dd |  D }dd |  D }| D ]}|d| = q,| D ]}|d| = qD|dd}|dkr|dk	sxtdd	d
lm}	 d|krd	dlm}
 |
j|f|ddi\}}||d< |	j|f|}|dd}|dkr@|dk	stdd	dlm	} d|kr2d	dlm}
 |
j|f|ddi\}}||d< |j|f|}|
dd}|dkrjtj|j|jf|}| ||||dS )a5  
        Instantiates an question encoder and a generator from one or two base classes of the library from pretrained
        model checkpoints.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you need to first set it back in training mode with `model.train()`.

        Params:
            question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the question encoder. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the generator. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args (remaining positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            retriever ([`RagRetriever`], *optional*):
                The retriever to use.
            kwwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                `output_attentions=True`).

                - To update the question_encoder configuration, use the prefix *question_encoder_* for each
                  configuration parameter.
                - To update the generator configuration, use the prefix *generator_* for each configuration parameter.
                - To update the parent model configuration, do not use a prefix for each configuration parameter.

                Behaves differently depending on whether a `config` is provided or automatically loaded.

        Example:

        ```python
        >>> from transformers import RagModel

        >>> # initialize a RAG from two pretrained models.
        >>> model = RagModel.from_pretrained_question_encoder_generator(
        ...     "facebook/dpr-question_encoder-single-nq-base", "t5-small"
        ... )
        >>> # saving model after fine-tuning
        >>> model.save_pretrained("./rag")
        >>> # load fine-tuned model
        >>> model = RagModel.from_pretrained("./rag")
        ```c                 S   s,   i | ]$\}}| d r|td d |qS )question_encoder_N
startswithlen.0argumentvaluer1   r1   r2   
<dictcomp>:  s   
 zQRagPreTrainedModel.from_pretrained_question_encoder_generator.<locals>.<dictcomp>c                 S   s,   i | ]$\}}| d r|td d |qS )
generator_NrB   rE   r1   r1   r2   rI   @  s   
 rA   rJ   modelNznIf `model` is not defined as an argument, a `question_encoder_pretrained_model_name_or_path` has to be defined   	AutoModelconfig)
AutoConfigZreturn_unused_kwargsTzqIf `generator_model` is not defined as an argument, a `generator_pretrained_model_name_or_path` has to be definedAutoModelForSeq2SeqLM)question_encoder	generatorrO   r?   )itemskeyspopAssertionErrorauto.modeling_autorN   Zauto.configuration_autorP   r7   rR   getr   'from_question_encoder_generator_configsrO   )r8   r=   r>   r?   r:   Zkwargs_question_encoderZkwargs_generatorkeyrS   rN   rP   Zquestion_encoder_configrT   rR   Zgenerator_configrO   r1   r1   r2   *from_pretrained_question_encoder_generator   sz    H




 z=RagPreTrainedModel.from_pretrained_question_encoder_generator)NNN)r)   r*   r+   r,   r   config_classZbase_model_prefixclassmethodr7   strr   r   r]   __classcell__r1   r1   r;   r2   r4      s      r4   a  

    RAG is a seq2seq model which encapsulates two core components: a question encoder and a generator. During a forward
    pass, we encode the input with the question encoder and pass it to the retriever to extract relevant context
    documents. The documents are then prepended to the input. Such contextualized inputs is passed to the generator.

    The question encoder can be any *autoencoding* model, preferably [`DPRQuestionEncoder`], and the generator can be
    any *seq2seq* model, preferably [`BartForConditionalGeneration`].

    The model can be initialized with a [`RagRetriever`] for end-to-end generation or used in combination with the
    outputs of a retriever in multiple steps---see examples for more details. The model is compatible any
    *autoencoding* model as the `question_encoder` and any *seq2seq* model with language model head as the `generator`.
    It has been tested with [`DPRQuestionEncoder`] as the `question_encoder` and [`BartForConditionalGeneration`] or
    [`T5ForConditionalGeneration`] as the `generator`.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.


    Args:
        config ([`RagConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
        question_encoder ([`PreTrainedModel`]):
            An encoder model compatible with the faiss index encapsulated by the `retriever`.
        generator ([`PreTrainedModel`]):
            A seq2seq model used as the generator in the RAG architecture.
        retriever ([`RagRetriever`]):
            A retriever class encapsulating a faiss index queried to obtain context documents for current inputs.
aO  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
            which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
            obtain the indices.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
            Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
            *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
            sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
            generator's encoder.

            Used by the ([`RagModel`]) model during decoding.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
            you're using with your RAG instance.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        past_key_values (`tuple(tuple(torch.FloatTensor))`):
            Tuple consists of two elements: `encoder_outputs` of the RAG model (see `encoder_outputs`) and
            `past_key_values` of the underlying generator. Can be used to speed up decoding. `past_key_values` are used
            in the ([`RagTokenForGeneration`]) model during decoding.
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
            has to be provided to the forward pass. `doc_scores` can be computed via
            `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
            the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
            provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_retrieved(`bool`, *optional*):
            Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask`. See returned tensors for more detail.
        n_docs (`int`, *optional*, defaults to `config.n_docs``)
            Number of documents to retrieve and/or number of documents for which to generate an answer.
c                       s   e Zd Zd	ee ee ee ee d fddZee	e
eedd
eej eej eeeej   eej eej eeeej   eej eej eej ee ee ee ee ee eeej ef dddZ  ZS )RagModelNrO   rS   rT   r?   c                    s   |d k	s |d k	r|d k	s t d|d kr>tj|j|jf|}n"t|| js`t d| d| j t | |d krddlm	} |
|j}|d krddlm} |
|j}|| _| jd k	rt|tst dt| j d|| _|| _|| _d | _d	| _d S )
NzQEither a configuration or an question_encoder and a generator has to be provided.zconfig: z has to be of type rL   rM   rQ   z`self.retriever` is of type z&, but should be of type `RagRetriever`F)rX   r   r[   rO   
isinstancer^   r6   __init__rY   rN   from_configrS   rR   rT   r?   r   typectx_encodercontext_encoder_training)selfrO   rS   rT   r?   r:   rN   rR   r;   r1   r2   re     sF     "
 zRagModel.__init__output_typer^   )	input_idsattention_maskencoder_outputsdecoder_input_idsdecoder_attention_maskr   r   r   r   	use_cacheoutput_attentionsoutput_hidden_statesoutput_retrievedn_docsr@   c                 C   s4  |dk	r|n| j j}|
dk	r |
n| j j}
|dk	r4|n| j j}|dk	rH|n| j j}|dk	r\|n| j j}| jdk	o|dks|	dks|dko|dk}|dkr*|r| j||dd}|d }| j|| 	 
tj | jj j|dd}| jr|d |d |d	 |d
 |d |d f\}}	}}}}|
|}|	
|}	|
|}|
|}| j||ddj}|d||jd }t|d|ddd}nb|d |d |d	 |d f\}}	}}|
|}|
|}|	
|}	t|d|ddd}n6|dk	std|	dk	std|dk	s*td|dk	s<td|jd | dksltd| d|jd  d|dk	r|j|dd}|dk	r|j|dd}| j||	|||||
|dd	}|sd}d}d}d}d}n|j}|j}|r|sd}d}	d}d}t|j||j||	||||||j |j!|j"|j#|j$|j%dS )a  
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> outputs = model(input_ids=inputs["input_ids"])
        ```NT)rn   return_dictr   ptprefixrv   Zreturn_tensorsr   r   r   Ztokenized_doc_idsZtokenized_doc_attention_maskZdoc_idsr   rL   zMake sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z^Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function.M The first dimension of `context_input_ids` should be a multiple of `n_docs`=	, but is .dim)	rm   rn   ro   rp   rq   r   rr   rs   rw   N)r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   )&rO   rv   rr   rs   rt   ru   r?   rS   cpudetachtor-   float32numpyrT   rz   ri   rh   Zpooler_outputviewshapebmm	unsqueeze	transposesqueezerX   repeat_interleavehidden_statesZ
attentionsr3   r   r   Zencoder_last_hidden_stateZencoder_hidden_statesZencoder_attentionsZdecoder_hidden_statesZdecoder_attentionsZcross_attentions)rj   rm   rn   ro   rp   rq   r   r   r   r   rr   rs   rt   ru   rv   Zhas_to_retrieveZquestion_enc_outputsr    Zretriever_outputsr   Zretrived_doc_input_idsZretrived_doc_attention_maskr   Zgen_outputsr!   r"   r1   r1   r2   forward  s   &

  	




     



 


zRagModel.forward)NNNN)NNNNNNNNNNNNNN)r)   r*   r+   r   r
   r   r   re   r   RAG_FORWARD_INPUTS_DOCSTRINGr   r3   _CONFIG_FOR_DOCr-   r0   Tensorr   r.   
BoolTensorboolintr   r   ra   r1   r1   r;   r2   rb     sV       *
              rb   zu
    A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
    c                       s  e Zd Zdee ee ee ee d fddZedddZedd	d
Z	e
eeeeddeej eej eeeej   eej eej eeeej   eej eej eej ee ee ee ee ee ee eej ee edddZedd Zedd Zedd Ze d eej eej eej eej eej ee ee ee ee ejd
ddZd!ddZedd Z   Z!S )"RagSequenceForGenerationNrc   c                    s^   |d k	s |d k	r|d k	s t d|d kr<tj|j|jf|}t | t||||d| _d S NzHEither a configuration or an encoder and a generator has to be provided.rc   rX   r   r[   rO   r6   re   rb   r5   rj   rO   rS   rT   r?   r:   r;   r1   r2   re     s      z!RagSequenceForGeneration.__init__r?   c                 C   s   || j _d S r   r5   r?   rj   r?   r1   r1   r2   set_retriever  s    z&RagSequenceForGeneration.set_retrieverrh   c                 C   s   d| j _|| j _d S NTr5   ri   rh   rj   rh   r1   r1   r2    set_context_encoder_for_training  s    z9RagSequenceForGeneration.set_context_encoder_for_trainingrk   )rm   rn   ro   rp   rq   r   r   r   r   rr   rs   rt   ru   exclude_bos_scorereduce_losslabelsrv   r@   c                 K   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}|dk	rT|dkrP|}d}
| j||||||||	||
||||d}d}|dk	r| j|j|j||| j j||d}t	||j|j|j
|j|j|j|j|j|j|j|j|j|j|j|j|jdS )a*	  
        exclude_bos_score (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when computing
            the loss.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
            operation.
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
             Legacy dictionary, which is required so that model can use *generate()* function.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-sequence-nq")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
        >>> input_ids = inputs["input_ids"]
        >>> labels = targets["input_ids"]
        >>> outputs = model(input_ids=input_ids, labels=labels)

        >>> # or use retriever separately
        >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
        >>> # 1. Encode
        >>> question_hidden_states = model.question_encoder(input_ids)[0]
        >>> # 2. Retrieve
        >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
        >>> doc_scores = torch.bmm(
        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
        ... ).squeeze(1)
        >>> # 3. Forward to generator
        >>> outputs = model(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ...     decoder_input_ids=labels,
        ... )
        ```NFrm   rn   ro   rp   rq   r   r   r   r   rr   rs   rt   ru   rv   )r   epsilonr   rv   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   )rO   rv   r   r   r5   get_nllr   r   label_smoothingr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   )rj   rm   rn   ro   rp   rq   r   r   r   r   rr   rs   rt   ru   r   r   r   rv   r:   outputsr   r1   r1   r2   r     sj    G
z RagSequenceForGeneration.forwardc                 C   s   | j jS r   r   rj   r1   r1   r2   r?   }  s    z"RagSequenceForGeneration.retrieverc                 C   s   | j jS r   r5   rT   r   r1   r1   r2   rT     s    z"RagSequenceForGeneration.generatorc                 C   s   | j jS r   r5   rS   r   r1   r1   r2   rS     s    z)RagSequenceForGeneration.question_encoder)
rm   rn   r   r   r   do_deduplicationnum_return_sequences	num_beamsrv   r@   c
                 K   s\  |	dk	r|	n| j j}	|dk	r |n| j j}|dk	r4|n| j j}|dk	rH|n| j j}|dk	sh|dk	shtd| jdk	r|dkr| j||dd }| j|| 	 
tj | jj j|	ddd }|
|}g }||
d< ||
d	< d|
d
< |dk	r|jd n|jd |	 }t|D ]8}|||	 |d |	  }| jj|f|
}|r\ttdd |D  }|jd }|dk	r|||d  |d}| ||dd}n|dk	std|dk	std||d}|||	 |d |	  }||d}|||d ddf }||d}| ||||dd}|d  |d }|||  q| j|| j jjdS )a  
        Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation
        for more information on how to set other generate input parameters.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation. If `input_ids` is not passed, then
                `context_input_ids` has to be provided.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
                retriever.
            context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `context_input_ids` and
                `context_attention_mask` have to be provided to the forward pass. They are returned by
                [`~RagRetriever.__call__`].
            doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
                Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
                `question_encoder_last_hidden_state`.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores` has to be
                provided to the forward pass. `doc_scores` are returned by [`~RagRetriever.__call__`].
            do_deduplication (`bool`, *optional*):
                Whether or not to deduplicate the generations from different context documents for a given input. Has
                to be set to `False` if used while training with distributed backend.
            num_return_sequences(`int`, *optional*, defaults to 1):
                The number of independently computed returned sequences for each element in the batch. Note that this
                is not the value we pass to the `generator`'s `[`~generation.GenerationMixin.generate`]` function,
                where we set `num_return_sequences` to `num_beams`.
            num_beams (`int`, *optional*, defaults to 1):
                Number of beams for beam search. 1 means no beam search.
            n_docs (`int`, *optional*, defaults to `config.n_docs`)
                Number of documents to retrieve and/or number of documents for which to generate an answer.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional kwargs will be passed to [`~generation.GenerationMixin.generate`].

        Return:
            `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
            sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all batches
            finished early due to the `eos_token_id`.
        Nz= At least one of input_ids or context_input_ids must be givenrn   r   rx   ry   r   r   r   rn   r   c                 S   s   i | ]}t | |qS r1   )r`   tolist)rF   kr1   r1   r2   rI     s     
 z5RagSequenceForGeneration.generate.<locals>.<dictcomp>T)r   r   zMake sure that `context_attention_mask` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.)r   r   r   r   r   r   )pad_token_id)rO   rv   r   r   r   rX   r?   rS   r   r   r   r-   r   r   rT   rz   r   rangegeneratestacklistvaluesrepeatZtopkappend_cat_and_padr   )rj   rm   rn   r   r   r   r   r   r   rv   model_kwargsZnum_doc_return_sequencesquestion_hidden_statesZhypos
batch_sizeindexZgenerator_input_idsZoutput_sequencesZnum_candidatesZnew_input_idsr   Zindividual_input_idsZindividual_attention_maskZindividual_doc_scoresZtop_cand_indsr1   r1   r2   r     s    A	
 
 z!RagSequenceForGeneration.generateF        c                    sH  t d d dd f jd d jjjgd|d k	rF|n jj} jj	p^ jjj	}|d k	od d df 
| }	 fdd}
tjj|dd|jd | |d|d}tjj|dddd}|d d d d d dd d f }|d d d d ddd d f }|d d d d dd d d f }t j||| |gdd}ddd|dd | kst|jdd}|jdd	d
}|
||\}}|r|	r|d d d d dd f dn|d}|d}|d}|d}| }| }|r"| }| }||d }d| | ||  }|S )Nr   r   c                    sD     jjj}| r0| |d ||d | d|dfS Nr   r{   eqrO   rT   r   anyZmasked_fill_r   ll
smooth_objZpad_maskrj   targetr1   r2   
_mask_pads.  s
    z4RagSequenceForGeneration.get_nll.<locals>._mask_padsr{   r   rL   r   r   Tr   Zkeepdim      ?)r-   catnewr   fill_rO   rT   r   rv   bos_token_idr   allr   
functionallog_softmaxr   sizer   r   r   rX   gathersum	logsumexp)rj   
seq_logitsr   r   r   r   r   rv   r   Zuse_bosr   seq_logprobsdoc_logprobsZfirst_token_scoresZsecond_token_scores	remainderrag_logprobsr   r   nll_losssmooth_losseps_ir   r1   r   r2   r      sH    0 "      6


z RagSequenceForGeneration.get_nllc                 C   sv   | d  tdd | D tdd | D |}d}| D ]6}|||||jd  d |jd f< ||jd 7 }q:|S )Nr   c                 S   s   g | ]}|j d  qS )r   r   rF   tr1   r1   r2   
<listcomp>^  s     z9RagSequenceForGeneration._cat_and_pad.<locals>.<listcomp>c                 S   s   g | ]}|j d  qS )r   r   r   r1   r1   r2   r   ^  s     r   )r   r   maxr   r   )Ztensorsr   outputindr   r1   r1   r2   r   [  s    0$z%RagSequenceForGeneration._cat_and_pad)NNNN)NNNNNNNNNNNNNNNNN)	NNNNNNNNN)Fr   FN)"r)   r*   r+   r   r
   r   r   re   r   r   r   r   r   r   r   r-   r0   r   r   r   r.   r   r   r   propertyr?   rT   rS   no_gradr   r   staticmethodr   ra   r1   r1   r;   r2   r     s   	    
                 


                 
;r   zo
    A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
    c                       s  e Zd Zd*ee ee ee ee d fddZedddZedd	d
Z	d+ddZ
edd Zedd Zedd Zedd Zd,ddZeeeeedd-eej eej eeeej   eej eej eeeej   eej eej eej ee ee ee ee ee ee eej ee edddZe  dddddddde! e" f
eej eej eej eej eej ee ee# e$eejge%e f ee! ee" ejdddZ&dd Z'd d! Z(d"d# Z)d.d$d%Z*d/d(d)Z+  Z,S )0RagTokenForGenerationNrc   c                    s^   |d k	s |d k	r|d k	s t d|d kr<tj|j|jf|}t | t||||d| _d S r   r   r   r;   r1   r2   re   n  s      zRagTokenForGeneration.__init__r   c                 C   s   || j _d S r   r   r   r1   r1   r2   r     s    z#RagTokenForGeneration.set_retrieverr   c                 C   s   d| j _|| j _d S r   r   r   r1   r1   r2   r     s    z6RagTokenForGeneration.set_context_encoder_for_trainingc           	   
   K   s4   |d k	r|d d dd f }d ||||||d|d	S )Nr{   T)	rm   ro   r   r   rp   r   rr   do_marginalizerv   r1   )	rj   rp   r   rn   rr   ro   r   rv   r:   r1   r1   r2   prepare_inputs_for_generation  s    z3RagTokenForGeneration.prepare_inputs_for_generationc                 C   s   | j jS r   r   r   r1   r1   r2   r?     s    zRagTokenForGeneration.retrieverc                 C   s   | j jS r   r   r   r1   r1   r2   rT     s    zRagTokenForGeneration.generatorc                 C   s   | j jS r   r   r   r1   r1   r2   rS     s    z&RagTokenForGeneration.question_encoderc                    s8   dd  d}| D ]"}|t  fdd|D f7 }q|S )zeReorders cache for generation. BART-inspired but we need to take care of the extra dimension for docsc                 S   sX   | j d |j d  }| jd|f| j dd   } | d|} | jd| j dd   }|S )Nr   r{   r   rL   )r{   )r   r   Zindex_select)r   Z	new_orderrv   resultr1   r1   r2   _reorder_stacked  s
    z>RagTokenForGeneration._reorder_cache.<locals>._reorder_stackedr1   c                 3   s    | ]} | |jV  qd S r   )r   device)rF   Z
past_stater   beam_idxr1   r2   	<genexpr>  s     z7RagTokenForGeneration._reorder_cache.<locals>.<genexpr>)tuple)r   r   Zreordered_pastZ
layer_pastr1   r   r2   _reorder_cache  s    z$RagTokenForGeneration._reorder_cachec                 C   sp   |d k	r|n| j j}tjj|dd|jd | |d|d}tj|dd}||	d	d }tj
|ddS )Nr{   r   r   r   )rO   rv   r   r   r   r   r   r   r-   r   r   )rj   r   r   rv   r   r   Zlog_prob_sumr1   r1   r2   marginalize  s       z!RagTokenForGeneration.marginalizerk   )rm   rn   ro   rp   rq   r   r   r   r   rr   rs   rt   ru   r   r   r   rv   r@   c                 K   s  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}|dk	rT|dkrP|}d}
| j||||||||	||
||||d}d}|j}|dk	r|dk	st| j|j|j||| j j	|d}|r| 
||j|}t|||j|j|j|j|j|j|j|j|j|j|j|j|j|j|jdS )ay
  
        do_marginalize (`bool`, *optional*):
            If `True`, the logits are marginalized over all documents by making use of
            `torch.nn.functional.log_softmax`.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
            operation.
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Legacy dictionary, which is required so that model can use *generate()* function.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
        >>> input_ids = inputs["input_ids"]
        >>> labels = targets["input_ids"]
        >>> outputs = model(input_ids=input_ids, labels=labels)

        >>> # or use retriever separately
        >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
        >>> # 1. Encode
        >>> question_hidden_states = model.question_encoder(input_ids)[0]
        >>> # 2. Retrieve
        >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
        >>> doc_scores = torch.bmm(
        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
        ... ).squeeze(1)
        >>> # 3. Forward to generator
        >>> outputs = model(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ...     decoder_input_ids=labels,
        ... )

        >>> # or directly generate
        >>> generated = model.generate(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ... )
        >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
        ```NFr   )r   r   rv   r   )rO   rv   r   r   r5   r   rX   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   )rj   rm   rn   ro   rp   rq   r   r   r   r   rr   rs   rt   ru   r   r   r   rv   r:   r   r   r   r1   r1   r2   r     sp    O	zRagTokenForGeneration.forward)rm   rn   r   r   r   rv   generation_configprefix_allowed_tokens_fnlogits_processorstopping_criteriar@   c              	      s  |dkr| j }t|}|jf |}dk	r0n| jj| jdk	r|dkr| j||dd }| j|| 	 
tj | jjjdd}|d |d |d   }}}|
|}|
|}|
|}t|d	|d	d
d	}|jd  dkstd d|jd  d|jd   | jj }|||dd}tj |j d	f|jtjt|  jd}|jd }|d }d" fdd	}|||jd}|||jd|d< |j|jdd}||d< ||d< ||d< |d< | j |||||	d}|jd	kr8|j!d	krt"d|j! d| j#|f||j$|j%|j&d|S |jd	kr|j!|jkrZt"dt' |j| j|j(|j)|j!|j$d }| j*||f||j$|j%|j&d|S t"d!|j dS )#a  
        Implements RAG token decoding.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation. If `input_ids` is not passed, then
                `context_input_ids` has to be provided.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
                Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
                `question_encoder_last_hidden_state`.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            n_docs (`int`, *optional*, defaults to `config.n_docs`)
                Number of documents to retrieve and/or number of documents for which to generate an answer.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
                `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
                the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This argument is useful for
                constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://arxiv.org/abs/2010.00904).
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and a
                model's config. If a logit processor is passed that is already created with the arguments or a model's
                config an error is thrown.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complement the default stopping criteria built from arguments and a
                model's config. If a stopping criteria is passed that is already created with the arguments or a
                model's config an error is thrown.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model.

        Return:
            `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
            sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches
            finished early due to the `eos_token_id`.
        Nr   r   rx   ry   r   r   r   r   rL   r|   r}   r~   T)rm   rn   rw   )Zdtyper   r{   last_hidden_statec                    sl   | d d d d f   df| jdd   } |  |f| jdd   } |   |  f| jdd   S )Nr   r	   )Zreshaper   expand)Ztensorr   r   rv   r1   r2   extend_enc_output  s    ,z9RagTokenForGeneration.generate.<locals>.extend_enc_output)r   r   r   ro   rn   rv   )r   input_ids_seq_lengthZencoder_input_idsr   r   z)num_return_sequences has to be 1, but is z when doing greedy search.)r   
max_lengthr   eos_token_idzA`num_return_sequences` has to be smaller or equal to `num_beams`.)r   r   r   length_penaltyZdo_early_stoppingZnum_beam_hyps_to_keepr  uH   `num_beams` has to be an integer strictly superior to 0 (≥ 1), but is )N)+r   copydeepcopyupdaterO   rv   r?   rS   r   r   r   r-   r   r   rT   rz   r   r   r   r   r   rX   r5   Zget_encoderfullr   decoder_start_token_idlongnext
parametersr   r   Z_get_logits_processorr   
ValueErrorZgreedy_searchr  r   r  r   r  Zearly_stoppingZbeam_search)rj   rm   rn   r   r   r   rv   r   r   r   r   r:   r   r   outr   encoderro   r  r   r   Zpre_processorZbeam_scorerr1   r   r2   r   ^  s    Q





	 
	

zRagTokenForGeneration.generatec                 C   s   | j j S r   )r5   rT   get_input_embeddingsr   r1   r1   r2   r  +  s    z*RagTokenForGeneration.get_input_embeddingsc                 C   s   | j j S r   )r5   rT   get_output_embeddingsr   r1   r1   r2   r  .  s    z+RagTokenForGeneration.get_output_embeddingsc                 C   s   | j j|S r   )r5   rT   set_output_embeddings)rj   Znew_embeddingsr1   r1   r2   r  1  s    z+RagTokenForGeneration.set_output_embeddingsc                 C   sX   |dkr| j j}||j}|ddddf  |ddddf< ||dddf< |S )zCShift input ids one token to the right, and pad with start_token_idNr{   r   r   )rO   r	  Z	new_zerosr   clone)rj   rm   Zstart_token_idZshifted_input_idsr1   r1   r2   shift_tokens_right4  s    (z(RagTokenForGeneration.shift_tokens_rightFr   c                    s  |d k	r|n j j}td d dd f jd d j jjgd fdd} 	|||}
d | kst|jdd}	|jddd}
||	|
\}	}
|	d}	|
d}
|	 }|
 }|r| }| }||d }d	| | ||  }|S )
Nr   r   c                    sD     jjj}| r0| |d ||d | d|dfS r   r   r   r   r1   r2   r   D  s
    z1RagTokenForGeneration.get_nll.<locals>._mask_padsr{   r   Tr   r   )rO   rv   r-   r   r   r   r   rT   r   r   r   r   rX   r   r   r   )rj   r   r   r   r   r   rv   r   r   r   r   r   r   r   r   r1   r   r2   r   =  s,    0 


zRagTokenForGeneration.get_nll)NNNN)NNNNNN)N)NNNNNNNNNNNNNNNNN)N)Fr   N)-r)   r*   r+   r   r
   r   r   re   r   r   r   r   r?   rT   rS   r   r   r   r   r   r   r   r   r-   r0   r.   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   ra   r1   r1   r;   r2   r   g  s   	          






                   M
	r   )+r,   r  dataclassesr   typingr   r   r   r   r   r-   r   Zconfiguration_utilsr
   Z
generationr   r   r   r   Zmodeling_outputsr   Zmodeling_utilsr   utilsr   r   r   Zconfiguration_ragr   Zretrieval_ragr   Z
get_loggerr)   loggerr   r   r3   r4   ZRAG_START_DOCSTRINGr   rb   r   r   r1   r1   r1   r2   <module>   sL   
^Y &&> q   