"""RAG Retriever model implementation."""

import os
import pickle
import time
from typing import Iterable, List, Optional, Tuple

import numpy as np

from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import BatchEncoding
from ...utils import cached_file, is_datasets_available, is_faiss_available, logging, requires_backends
from .configuration_rag import RagConfig
from .tokenization_rag import RagTokenizer


if is_datasets_available():
    from datasets import Dataset, load_dataset, load_from_disk

if is_faiss_available():
    import faiss


logger = logging.get_logger(__name__)


LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/"


class Index:
    """
    A base class for the Indices encapsulated by the [`RagRetriever`].
    """

    def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
        """
        Returns a list of dictionaries, containing titles and text of the retrieved documents.

        Args:
            doc_ids (`np.ndarray` of shape `(batch_size, n_docs)`):
                A tensor of document indices.
        """
        raise NotImplementedError

    def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
        """
        For each query in the batch, retrieves `n_docs` documents.

        Args:
            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                An array of query vectors.
            n_docs (`int`):
                The number of docs retrieved per query.

        Returns:
            `np.ndarray` of shape `(batch_size, n_docs)`: A tensor of indices of retrieved documents. `np.ndarray` of
            shape `(batch_size, vector_size)`: A tensor of vector representations of retrieved documents.
        """
        raise NotImplementedError

    def is_initialized(self):
        """
        Returns `True` if index is already initialized.
        """
        raise NotImplementedError

    def init_index(self):
        """
        A function responsible for loading the index into memory. Should be called only once per training run of a RAG
        model. E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load
        the index.
        """
        raise NotImplementedError
class LegacyIndex(Index):
    """
    An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. We use
    default faiss index parameters as specified in that repository.

    Args:
        vector_size (`int`):
            The dimension of indexed vectors.
        index_path (`str`):
            A path to a *directory* containing index files compatible with [`~models.rag.retrieval_rag.LegacyIndex`]
    """

    INDEX_FILENAME = "hf_bert_base.hnswSQ8_correct_phi_128.c_index"
    PASSAGE_FILENAME = "psgs_w100.tsv.pkl"

    def __init__(self, vector_size, index_path):
        self.index_id_to_db_id = []
        self.index_path = index_path
        self.passages = self._load_passages()
        self.vector_size = vector_size
        self.index = None
        self._index_initialized = False

    def _resolve_path(self, index_path, filename):
        is_local = os.path.isdir(index_path)
        try:
            # Load from URL or from cache if already cached
            resolved_archive_file = cached_file(index_path, filename)
        except EnvironmentError:
            msg = (
                f"Can't load '{filename}'. Make sure that:\n\n"
                f"- '{index_path}' is a correct remote path to a directory containing a file named {filename}\n\n"
                f"- or '{index_path}' is the correct path to a directory containing a file named {filename}.\n\n"
            )
            raise EnvironmentError(msg)
        if is_local:
            logger.info(f"loading file {resolved_archive_file}")
        else:
            logger.info(f"loading file {filename} from cache at {resolved_archive_file}")
        return resolved_archive_file

    def _load_passages(self):
        logger.info(f"Loading passages from {self.index_path}")
        passages_path = self._resolve_path(self.index_path, self.PASSAGE_FILENAME)
        with open(passages_path, "rb") as passages_file:
            passages = pickle.load(passages_file)
        return passages

    def _deserialize_index(self):
        logger.info(f"Loading index from {self.index_path}")
        resolved_index_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index.dpr")
        self.index = faiss.read_index(resolved_index_path)
        resolved_meta_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index_meta.dpr")
        with open(resolved_meta_path, "rb") as metadata_file:
            self.index_id_to_db_id = pickle.load(metadata_file)
        assert (
            len(self.index_id_to_db_id) == self.index.ntotal
        ), "Deserialized index_id_to_db_id should match faiss index size"

    def is_initialized(self):
        return self._index_initialized

    def init_index(self):
        # The stored vectors carry one extra auxiliary dimension (see `get_top_docs`), hence `vector_size + 1`
        index = faiss.IndexHNSWFlat(self.vector_size + 1, 512)
        index.hnsw.efSearch = 128
        index.hnsw.efConstruction = 200
        self.index = index
        self._deserialize_index()
        self._index_initialized = True

    def get_doc_dicts(self, doc_ids: np.ndarray):
        doc_list = []
        for doc_ids_i in doc_ids:
            ids = [str(int(doc_id)) for doc_id in doc_ids_i]
            docs = [self.passages[doc_id] for doc_id in ids]
            doc_list.append(docs)
        doc_dicts = []
        for docs in doc_list:
            doc_dict = {}
            doc_dict["title"] = [doc[1] for doc in docs]
            doc_dict["text"] = [doc[0] for doc in docs]
            doc_dicts.append(doc_dict)
        return doc_dicts

    def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
        # DPR's trick to run maximum inner product search on an L2-based HNSW index: documents are stored
        # with an auxiliary dimension, and queries are padded with a zero in that dimension.
        aux_dim = np.zeros(len(question_hidden_states), dtype="float32").reshape(-1, 1)
        query_nhsw_vectors = np.hstack((question_hidden_states, aux_dim))
        _, docs_ids = self.index.search(query_nhsw_vectors, n_docs)
        # Strip the auxiliary dimension when reconstructing the document vectors
        vectors = [[self.index.reconstruct(int(doc_id))[:-1] for doc_id in doc_ids] for doc_ids in docs_ids]
        ids = [[int(self.index_id_to_db_id[doc_id]) for doc_id in doc_ids] for doc_ids in docs_ids]
        return np.array(ids), np.array(vectors)


class HFIndexBase(Index):
    def __init__(self, vector_size, dataset, index_initialized=False):
        self.vector_size = vector_size
        self.dataset = dataset
        self._index_initialized = index_initialized
        self._check_dataset_format(with_index=index_initialized)
        dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True, dtype="float32")

    def _check_dataset_format(self, with_index: bool):
        if not isinstance(self.dataset, Dataset):
            raise ValueError(f"Dataset should be a datasets.Dataset object, but got {type(self.dataset)}")
        if len({"title", "text", "embeddings"} - set(self.dataset.column_names)) > 0:
            raise ValueError(
                "Dataset should be a dataset with the following columns: "
                "title (str), text (str) and embeddings (arrays of dimension vector_size), "
                f"but got columns {self.dataset.column_names}"
            )
        if with_index and "embeddings" not in self.dataset.list_indexes():
            raise ValueError(
                "Missing faiss index in the dataset. Make sure you called `dataset.add_faiss_index` to compute it "
                "or `dataset.load_faiss_index` to load one from the disk."
            )

    def init_index(self):
        raise NotImplementedError()

    def is_initialized(self):
        return self._index_initialized

    def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
        return [self.dataset[doc_ids[i].tolist()] for i in range(doc_ids.shape[0])]

    def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
        _, ids = self.dataset.search_batch("embeddings", question_hidden_states, n_docs)
        docs = [self.dataset[[i for i in indices if i >= 0]] for indices in ids]
        vectors = [doc["embeddings"] for doc in docs]
        for i in range(len(vectors)):
            if len(vectors[i]) < n_docs:
                # Pad with zero vectors when the index returns fewer than `n_docs` hits
                vectors[i] = np.vstack([vectors[i], np.zeros((n_docs - len(vectors[i]), self.vector_size))])
        return np.array(ids), np.array(vectors)  # shapes (batch_size, n_docs) and (batch_size, n_docs, d)
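# Illustrative sketch, not part of the upstream module: a helper that builds a toy
# dataset satisfying `_check_dataset_format` above, i.e. the three required columns
# plus a faiss index named "embeddings". The helper name and all values are
# hypothetical; random embeddings stand in for real DPR context encodings.
def _toy_indexed_dataset_example(vector_size=4):
    toy = Dataset.from_dict(
        {
            "title": ["Aaron", "Zulu"],
            "text": ["Aaron is a prophet.", "Zulu is a Bantu language."],
            "embeddings": np.random.randn(2, vector_size).astype("float32").tolist(),
        }
    )
    # The index name defaults to the column name, which is what HFIndexBase expects
    toy.add_faiss_index(column="embeddings")
    # Usage: HFIndexBase(vector_size, _toy_indexed_dataset_example(), index_initialized=True)
    return toy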
class CanonicalHFIndex(HFIndexBase):
    """
    A wrapper around an instance of [`~datasets.Dataset`]. If `index_path` is set to `None`, we load the pre-computed
    index available with the [`~datasets.arrow_dataset.Dataset`], otherwise, we load the index from the indicated path
    on disk.

    Args:
        vector_size (`int`): the dimension of the passages embeddings used by the index
        dataset_name (`str`, optional, defaults to `wiki_dpr`):
            A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
            with `datasets.list_datasets()`).
        dataset_split (`str`, optional, defaults to `train`):
            Which split of the `dataset` to load.
        index_name (`str`, optional, defaults to `None`):
            The index_name of the index associated with the `dataset`. The index loaded from `index_path` will be saved
            under this name.
        index_path (`str`, optional, defaults to `None`):
            The path to the serialized faiss index on disk.
        use_dummy_dataset (`bool`, optional, defaults to `False`):
            If True, use the dummy configuration of the dataset for tests.
    """

    def __init__(
        self,
        vector_size: int,
        dataset_name: str = "wiki_dpr",
        dataset_split: str = "train",
        index_name: Optional[str] = None,
        index_path: Optional[str] = None,
        use_dummy_dataset=False,
    ):
        # Exactly one of `index_name` and `index_path` must be provided
        if int(index_path is None) + int(index_name is None) != 1:
            raise ValueError("Please provide `index_name` or `index_path`.")
        self.dataset_name = dataset_name
        self.dataset_split = dataset_split
        self.index_name = index_name
        self.index_path = index_path
        self.use_dummy_dataset = use_dummy_dataset
        logger.info(f"Loading passages from {self.dataset_name}")
        dataset = load_dataset(
            self.dataset_name, with_index=False, split=self.dataset_split, dummy=self.use_dummy_dataset
        )
        super().__init__(vector_size, dataset, index_initialized=False)

    def init_index(self):
        if self.index_path is not None:
            logger.info(f"Loading index from {self.index_path}")
            self.dataset.load_faiss_index("embeddings", file=self.index_path)
        else:
            logger.info(f"Loading index from {self.dataset_name} with index name {self.index_name}")
            self.dataset = load_dataset(
                self.dataset_name,
                with_embeddings=True,
                with_index=True,
                split=self.dataset_split,
                index_name=self.index_name,
                dummy=self.use_dummy_dataset,
            )
            self.dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True)
        self._index_initialized = True
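# Illustrative sketch, not part of the upstream module: loading the canonical
# wiki_dpr index. `use_dummy_dataset=True` fetches only a small test slice instead
# of the full ~21M passages; the first call downloads data. The helper name is
# hypothetical.
def _load_canonical_index_example():
    index = CanonicalHFIndex(
        vector_size=768,  # DPR passage embeddings are 768-dimensional
        dataset_name="wiki_dpr",
        dataset_split="train",
        index_name="compressed",
        use_dummy_dataset=True,
    )
    index.init_index()
    return index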
ed fddZedd Zdd	 Z  Z	S )CustomHFIndexa  
    A wrapper around an instance of [`~datasets.Datasets`]. The dataset and the index are both loaded from the
    indicated paths on disk.

    Args:
        vector_size (`int`): the dimension of the passages embeddings used by the index
        dataset_path (`str`):
            The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
            embeddings (arrays of dimension vector_size)
        index_path (`str`)
            The path to the serialized faiss index on disk.
    N)r3   c                    s    t  j|||d kd || _d S )Nr   )r   r6   r0   )r   r3   rq   r0   r   r   r   r6   -  s    zCustomHFIndex.__init__c                 C   s>   t d|  |d ks |d kr(tdt|}| |||dS )NrA   zPlease provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` and `dataset.get_index('embeddings').save(index_path)`.)r3   rq   r0   )r;   r<   rv   r   )clsr3   dataset_pathr0   rq   r   r   r   r   1  s    zCustomHFIndex.load_from_diskc                 C   s6   |   s2td| j  | jjd| jd d| _d S )NrH   rn   r   T)r%   r;   r<   r0   rq   r   r5   r$   r   r   r   r&   <  s    zCustomHFIndex.init_index)N)
r'   r(   r)   r*   rT   r6   classmethodr   r&   r   r   r   r   r   r     s
   

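# Illustrative sketch, not part of the upstream module: the save/load round trip
# that `load_from_disk` above expects. The helper name and paths are hypothetical;
# `dataset` must already carry title/text/embeddings columns.
def _custom_index_round_trip_example(dataset, dataset_path, index_path, vector_size=768):
    dataset.add_faiss_index(column="embeddings")
    dataset.get_index("embeddings").save(index_path)
    dataset.drop_index("embeddings")  # save_to_disk cannot serialize an attached faiss index
    dataset.save_to_disk(dataset_path)
    # The returned index lazily reloads the faiss index when `init_index()` is called
    return CustomHFIndex.load_from_disk(vector_size, dataset_path=dataset_path, index_path=index_path)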
class RagRetriever:
    """
    Retriever used to get documents from vector queries. It retrieves the documents embeddings as well as the documents
    contents, and it formats them to be used with a RagModel.

    Args:
        config ([`RagConfig`]):
            The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
            `Index` to build. You can load your own custom dataset with `config.index_name="custom"` or use a canonical
            one (default) from the datasets library with `config.index_name="wiki_dpr"` for example.
        question_encoder_tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer that was used to tokenize the question. It is used to decode the question and then use the
            generator_tokenizer.
        generator_tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer used for the generator part of the RagModel.
        index ([`~models.rag.retrieval_rag.Index`], optional, defaults to the one defined by the configuration):
            If specified, use this index instead of the one built using the configuration

    Examples:

    ```python
    >>> # To load the default "wiki_dpr" dataset with 21M passages from wikipedia (index name is 'compressed' or 'exact')
    >>> from transformers import RagRetriever

    >>> retriever = RagRetriever.from_pretrained(
    ...     "facebook/dpr-ctx_encoder-single-nq-base", dataset="wiki_dpr", index_name="compressed"
    ... )

    >>> # To load your own indexed dataset built with the datasets library. More info on how to build the indexed dataset in examples/rag/use_own_knowledge_dataset.py
    >>> from transformers import RagRetriever

    >>> dataset = (
    ...     ...
    ... )  # dataset must be a datasets.Dataset object with columns "title", "text" and "embeddings", and it must have a faiss index
    >>> retriever = RagRetriever.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", indexed_dataset=dataset)

    >>> # To load your own indexed dataset built with the datasets library that was saved on disk. More info in examples/rag/use_own_knowledge_dataset.py
    >>> from transformers import RagRetriever

    >>> dataset_path = "path/to/my/dataset"  # dataset saved via *dataset.save_to_disk(...)*
    >>> index_path = "path/to/my/index.faiss"  # faiss index saved via *dataset.get_index("embeddings").save(...)*
    >>> retriever = RagRetriever.from_pretrained(
    ...     "facebook/dpr-ctx_encoder-single-nq-base",
    ...     index_name="custom",
    ...     passages_path=dataset_path,
    ...     index_path=index_path,
    ... )

    >>> # To load the legacy index built originally for Rag's paper
    >>> from transformers import RagRetriever

    >>> retriever = RagRetriever.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", index_name="legacy")
    ```"""

    def __init__(self, config, question_encoder_tokenizer, generator_tokenizer, index=None, init_retrieval=True):
        self._init_retrieval = init_retrieval
        requires_backends(self, ["datasets", "faiss"])
        super().__init__()
        self.index = index or self._build_index(config)
        self.generator_tokenizer = generator_tokenizer
        self.question_encoder_tokenizer = question_encoder_tokenizer

        self.n_docs = config.n_docs
        self.batch_size = config.retrieval_batch_size

        self.config = config
        if self._init_retrieval:
            self.init_retrieval()

        self.ctx_encoder_tokenizer = None
        self.return_tokenized_docs = False

    @staticmethod
    def _build_index(config):
        if config.index_name == "legacy":
            return LegacyIndex(
                config.retrieval_vector_size,
                config.index_path or LEGACY_INDEX_PATH,
            )
        elif config.index_name == "custom":
            return CustomHFIndex.load_from_disk(
                vector_size=config.retrieval_vector_size,
                dataset_path=config.passages_path,
                index_path=config.index_path,
            )
        else:
            return CanonicalHFIndex(
                vector_size=config.retrieval_vector_size,
                dataset_name=config.dataset,
                dataset_split=config.dataset_split,
                index_name=config.index_name,
                index_path=config.index_path,
                use_dummy_dataset=config.use_dummy_dataset,
            )

    @classmethod
    def from_pretrained(cls, retriever_name_or_path, indexed_dataset=None, **kwargs):
        requires_backends(cls, ["datasets", "faiss"])
        config = kwargs.pop("config", None) or RagConfig.from_pretrained(retriever_name_or_path, **kwargs)
        rag_tokenizer = RagTokenizer.from_pretrained(retriever_name_or_path, config=config)
        question_encoder_tokenizer = rag_tokenizer.question_encoder
        generator_tokenizer = rag_tokenizer.generator
        if indexed_dataset is not None:
            config.index_name = "custom"
            index = CustomHFIndex(config.retrieval_vector_size, indexed_dataset)
        else:
            index = cls._build_index(config)
        return cls(
            config,
            question_encoder_tokenizer=question_encoder_tokenizer,
            generator_tokenizer=generator_tokenizer,
            index=index,
        )

    def save_pretrained(self, save_directory):
        if isinstance(self.index, CustomHFIndex):
            if self.config.index_path is None:
                index_path = os.path.join(save_directory, "hf_dataset_index.faiss")
                self.index.dataset.get_index("embeddings").save(index_path)
                self.config.index_path = index_path
            if self.config.passages_path is None:
                passages_path = os.path.join(save_directory, "hf_dataset")
                # Detach the faiss index before saving: `save_to_disk` does not support attached indexes
                faiss_index = self.index.dataset._indexes.pop("embeddings")
                self.index.dataset.save_to_disk(passages_path)
                self.index.dataset._indexes["embeddings"] = faiss_index
                self.config.passages_path = passages_path
        self.config.save_pretrained(save_directory)
        rag_tokenizer = RagTokenizer(
            question_encoder=self.question_encoder_tokenizer,
            generator=self.generator_tokenizer,
        )
        rag_tokenizer.save_pretrained(save_directory)

    def init_retrieval(self):
        """
        Retriever initialization function. It loads the index into memory.
        """
        logger.info("initializing retrieval")
        self.index.init_index()
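    # Illustrative note, not in the upstream source: `save_pretrained` above writes
    # the config, both tokenizers and, for a custom index, the dataset plus its faiss
    # index into one directory, so a retriever can later be rebuilt with a plain
    # `RagRetriever.from_pretrained(save_directory)`. For example (hypothetical path):
    #
    #     retriever.save_pretrained("./my_rag_retriever")
    #     retriever = RagRetriever.from_pretrained("./my_rag_retriever")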
    def postprocess_docs(self, docs, input_strings, prefix, n_docs, return_tensors=None):
        r"""
        Postprocessing retrieved `docs` and combining them with `input_strings`.

        Args:
            docs (`dict`):
                Retrieved documents.
            input_strings (`str`):
                Input strings decoded by `preprocess_query`.
            prefix (`str`):
                Prefix added at the beginning of each input, typically used with T5-based models.

        Return:
            `tuple(tensors)`: a tuple consisting of two elements: contextualized `input_ids` and a compatible
            `attention_mask`.
        """

        def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
            # Strip surrounding quotes that wiki_dpr titles may carry
            if doc_title.startswith('"'):
                doc_title = doc_title[1:]
            if doc_title.endswith('"'):
                doc_title = doc_title[:-1]
            if prefix is None:
                prefix = ""
            # Resulting layout: "{prefix}{title}{title_sep}{text}{doc_sep}{question}", with double spaces collapsed
            out = (prefix + doc_title + self.config.title_sep + doc_text + self.config.doc_sep + input_string).replace(
                "  ", " "
            )
            return out

        rag_input_strings = [
            cat_input_and_doc(
                docs[i]["title"][j],
                docs[i]["text"][j],
                input_strings[i],
                prefix,
            )
            for i in range(len(docs))
            for j in range(n_docs)
        ]

        contextualized_inputs = self.generator_tokenizer.batch_encode_plus(
            rag_input_strings,
            max_length=self.config.max_combined_length,
            return_tensors=return_tensors,
            padding="max_length",
            truncation=True,
        )

        return contextualized_inputs["input_ids"], contextualized_inputs["attention_mask"]
    def _chunk_tensor(self, t: Iterable, chunk_size: int) -> List[Iterable]:
        return [t[i : i + chunk_size] for i in range(0, len(t), chunk_size)]

    def _main_retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, np.ndarray]:
        question_hidden_states_batched = self._chunk_tensor(question_hidden_states, self.batch_size)
        ids_batched = []
        vectors_batched = []
        for question_hidden_states in question_hidden_states_batched:
            start_time = time.time()
            ids, vectors = self.index.get_top_docs(question_hidden_states, n_docs)
            logger.debug(
                f"index search time: {time.time() - start_time} sec, batch size {question_hidden_states.shape}"
            )
            ids_batched.extend(ids)
            vectors_batched.extend(vectors)
        return (
            np.array(ids_batched),
            np.array(vectors_batched),
        )  # shapes (batch_size, n_docs) and (batch_size, n_docs, d)

    def retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, np.ndarray, List[dict]]:
        """
        Retrieves documents for specified `question_hidden_states`.

        Args:
            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                A batch of query vectors to retrieve with.
            n_docs (`int`):
                The number of docs retrieved per query.

        Return:
            `Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:

            - **retrieved_doc_embeds** (`np.ndarray` of shape `(batch_size, n_docs, dim)`) -- The retrieval embeddings
              of the retrieved docs per query.
            - **doc_ids** (`np.ndarray` of shape `(batch_size, n_docs)`) -- The ids of the documents in the index
            - **doc_dicts** (`List[dict]`): The `retrieved_doc_embeds` examples per query.
        """
        doc_ids, retrieved_doc_embeds = self._main_retrieve(question_hidden_states, n_docs)
        return retrieved_doc_embeds, doc_ids, self.index.get_doc_dicts(doc_ids)

    def set_ctx_encoder_tokenizer(self, ctx_encoder_tokenizer: PreTrainedTokenizer):
        # used in end2end retriever training
        self.ctx_encoder_tokenizer = ctx_encoder_tokenizer
        self.return_tokenized_docs = True

    def __call__(
        self,
        question_input_ids: List[List[int]],
        question_hidden_states: np.ndarray,
        prefix=None,
        n_docs=None,
        return_tensors=None,
    ) -> BatchEncoding:
        """
        Retrieves documents for specified `question_hidden_states`.

        Args:
            question_input_ids (`List[List[int]]`) batch of input ids
            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`:
                A batch of query vectors to retrieve with.
            prefix (`str`, *optional*):
                The prefix used by the generator's tokenizer.
            n_docs (`int`, *optional*):
                The number of docs retrieved per query.
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to "pt"):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.

        Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **context_input_ids** -- List of token ids to be fed to a model.

              [What are input IDs?](../glossary#input-ids)

            - **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
            (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).

              [What are attention masks?](../glossary#attention-mask)

            - **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
            - **doc_ids** -- List of ids of the retrieved documents
        NT)Zskip_special_tokens)r   r\   rZ   longest)r   r   r   r   r   )context_input_idscontext_attention_maskr   r   Ztokenized_doc_idsZtokenized_doc_attention_mask)Ztensor_type)r   r   r   r   )r"   r   r   r   r   r   Zbatch_decoder   r   r}   rK   r]   r   r   )r   r   r!   r   r"   r   r   r   r_   r   r   r   Zretrieved_doc_textZretrieved_doc_titleZb_idxZdoc_idxZtokenized_docsr   r   r   __call__4  sV    )    
zRagRetriever.__call__)NT)N)N)NNN)r'   r(   r)   r*   r6   staticmethodr   r   r   r   r   r   r   rT   r   r   r+   r,   r   r   r-   r   r   r   r   r   r   r   r   r   r   r   C  s*   5

4 "	   
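# Illustrative end-to-end sketch, not part of the upstream module: encode a question
# with a DPR question encoder, then retrieve and contextualize. The checkpoint names
# are public Hub ids, but the helper name is hypothetical, and the first call
# downloads models and data.
def _end_to_end_retrieval_example():
    from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

    retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
    tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

    inputs = tokenizer("who holds the record in 100m freestyle", return_tensors="pt")
    question_hidden_states = encoder(**inputs).pooler_output  # shape (1, 768)

    out = retriever(
        inputs["input_ids"].numpy(),
        question_hidden_states.detach().numpy(),
        return_tensors="pt",
    )
    return out["context_input_ids"], out["doc_ids"]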