U
    9%e                     @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZ ddlm Z  e!e"Z#dZ$dZ%ddddgZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*G dd dej'Z+G dd dej'Z,G dd dej'Z-G d d! d!ej'Z.G d"d# d#ej'Z/G d$d% d%eZ0d&Z1d'Z2ed(e1G d)d* d*e0Z3G d+d, d,ej'Z4G d-d. d.ej'Z5ed/e1G d0d1 d1e0Z6eG d2d3 d3eZ7ed4e1G d5d6 d6e0Z8dS )7z PyTorch Splinter model.    N)	dataclass)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN))BaseModelOutputWithPastAndCrossAttentionsModelOutputQuestionAnsweringModelOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )SplinterConfigztau/splinter-baser   ztau/splinter-base-qassztau/splinter-largeztau/splinter-large-qassc                       sV   e Zd ZdZ fddZd	eej eej eej eej ee	 e
dddZ  ZS )
SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd t|dd| _d S )	N)padding_idxZepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	EmbeddingZ
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingsZtype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutZregister_buffertorcharangeexpandgetattrr   selfconfig	__class__ m/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/splinter/modeling_splinter.pyr!   4   s    
  zSplinterEmbeddings.__init__Nr   )	input_idstoken_type_idsr   inputs_embedspast_key_values_lengthreturnc                 C   s   |d k	r|  }n|  d d }|d }|d krL| jd d ||| f }|d krjtj|tj| jjd}|d kr|| |}| |}|| }	| jdkr| 	|}
|	|
7 }	| 
|	}	| |	}	|	S )Nr   r   dtypedevicer   )sizer   r.   zeroslongr@   r%   r(   r   r'   r)   r-   )r3   r9   r:   r   r;   r<   input_shape
seq_lengthr(   
embeddingsr'   r7   r7   r8   forwardE   s$    






zSplinterEmbeddings.forward)NNNNr   )__name__
__module____qualname____doc__r!   r   r.   
LongTensorFloatTensorintr   rG   __classcell__r7   r7   r5   r8   r   1   s        r   c                
       s   e Zd Zd fdd	ZejejdddZdejeej eej eej eej ee	e	ej   ee
 e	ej dd	d
Z  ZS )SplinterSelfAttentionNc                    s   t    |j|j dkr>t|ds>td|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|pt|dd| _| jdks| jd	kr|j| _t	d
|j d | j| _|j| _d S )Nr   Zembedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   relative_keyrelative_key_query   r   )r    r!   r#   num_attention_headshasattr
ValueErrorrN   attention_head_sizeall_head_sizer   Linearquerykeyvaluer+   Zattention_probs_dropout_probr-   r1   r   r&   r"   distance_embedding
is_decoderr3   r4   r   r5   r7   r8   r!   i   s*    
  zSplinterSelfAttention.__init__)xr=   c                 C   s6   |  d d | j| jf }||}|ddddS )Nr   r   rT   r   r	   )rA   rU   rX   viewpermute)r3   ra   Znew_x_shaper7   r7   r8   transpose_for_scores   s    
z*SplinterSelfAttention.transpose_for_scoresFhidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsr=   c                 C   s  |  |}|d k	}	|	r4|d k	r4|d }
|d }|}n|	r^| | |}
| | |}|}nv|d k	r| | |}
| | |}tj|d |
gdd}
tj|d |gdd}n | | |}
| | |}| |}|d k	}| jr|
|f}t||
dd}| j	dks | j	dkr|j
d |
j
d  }}|r^tj|d tj|jd	dd}ntj|tj|jd	dd}tj|tj|jd	dd}|| }| || j d }|j|jd
}| j	dkrtd||}|| }n4| j	dkrtd||}td|
|}|| | }|t| j }|d k	r:|| }tjj|dd}| |}|d k	rf|| }t||}|dddd }| d d | jf }||}|r||fn|f}| jr||f }|S )Nr   r   rT   dimr   rR   rS   r>   )r?   zbhld,lrd->bhlrzbhrd,lrd->bhlrr	   ) r[   rd   r\   r]   r.   catr_   matmulZ	transposer   shapeZtensorrC   r@   rb   r/   r^   r&   tor?   ZeinsummathsqrtrX   r   Z
functionalZsoftmaxr-   rc   
contiguousrA   rY   )r3   rf   rg   rh   ri   rj   rk   rl   Zmixed_query_layerZis_cross_attentionZ	key_layerZvalue_layerZquery_layer	use_cacheZattention_scoresZquery_lengthZ
key_lengthZposition_ids_lZposition_ids_rZdistanceZpositional_embeddingZrelative_position_scoresZrelative_position_scores_queryZrelative_position_scores_keyZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr7   r7   r8   rG      sp    


 





zSplinterSelfAttention.forward)N)NNNNNF)rH   rI   rJ   r!   r.   Tensorrd   r   rM   r   boolrG   rO   r7   r7   r5   r8   rP   h   s$         rP   c                       s4   e Zd Z fddZejejejdddZ  ZS )SplinterSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r    r!   r   rZ   r#   denser)   r*   r+   r,   r-   r2   r5   r7   r8   r!      s    
zSplinterSelfOutput.__init__rf   input_tensorr=   c                 C   s&   |  |}| |}| || }|S Nr}   r-   r)   r3   rf   r   r7   r7   r8   rG      s    

zSplinterSelfOutput.forwardrH   rI   rJ   r!   r.   ry   rG   rO   r7   r7   r5   r8   r{      s   r{   c                
       sv   e Zd Zd
 fdd	Zdd Zdejeej eej eej eej ee	e	ej   ee
 e	ej ddd	Z  ZS )SplinterAttentionNc                    s.   t    t||d| _t|| _t | _d S )Nr   )r    r!   rP   r3   r{   outputsetpruned_headsr`   r5   r7   r8   r!      s    

zSplinterAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rm   )lenr   r3   rU   rX   r   r   r[   r\   r]   r   r}   rY   union)r3   headsindexr7   r7   r8   prune_heads  s       zSplinterAttention.prune_headsFre   c              	   C   s<   |  |||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )r3   r   )r3   rf   rg   rh   ri   rj   rk   rl   Zself_outputsattention_outputrx   r7   r7   r8   rG     s    
	zSplinterAttention.forward)N)NNNNNF)rH   rI   rJ   r!   r   r.   ry   r   rM   r   rz   rG   rO   r7   r7   r5   r8   r      s$         r   c                       s0   e Zd Z fddZejejdddZ  ZS )SplinterIntermediatec                    sB   t    t|j|j| _t|jt	r6t
|j | _n|j| _d S r   )r    r!   r   rZ   r#   intermediate_sizer}   
isinstance
hidden_actstrr
   intermediate_act_fnr2   r5   r7   r8   r!   1  s
    
zSplinterIntermediate.__init__)rf   r=   c                 C   s   |  |}| |}|S r   )r}   r   )r3   rf   r7   r7   r8   rG   9  s    

zSplinterIntermediate.forwardr   r7   r7   r5   r8   r   0  s   r   c                       s4   e Zd Z fddZejejejdddZ  ZS )SplinterOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r|   )r    r!   r   rZ   r   r#   r}   r)   r*   r+   r,   r-   r2   r5   r7   r8   r!   A  s    
zSplinterOutput.__init__r~   c                 C   s&   |  |}| |}| || }|S r   r   r   r7   r7   r8   rG   G  s    

zSplinterOutput.forwardr   r7   r7   r5   r8   r   @  s   r   c                
       st   e Zd Z fddZd
ejeej eej eej eej eeeej   ee	 eej dddZ
dd	 Z  ZS )SplinterLayerc                    sr   t    |j| _d| _t|| _|j| _|j| _| jrZ| jsLt|  dt|dd| _	t
|| _t|| _d S )Nr   z> should be used as a decoder model if cross attention is addedr   r   )r    r!   chunk_size_feed_forwardseq_len_dimr   	attentionr_   add_cross_attentionrW   crossattentionr   intermediater   r   r2   r5   r7   r8   r!   P  s    


zSplinterLayer.__init__NFre   c              	   C   s  |d k	r|d d nd }| j |||||d}	|	d }
| jrP|	dd }|	d }n|	dd  }d }| jr|d k	rt| dstd|  d|d k	r|d	d  nd }| |
||||||}|d }
||dd  }|d }|| }t| j| j| j|
}|f| }| jr||f }|S )
NrT   rl   rk   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`ro   )	r   r_   rV   rW   r   r   feed_forward_chunkr   r   )r3   rf   rg   rh   ri   rj   rk   rl   Zself_attn_past_key_valueZself_attention_outputsr   rx   Zpresent_key_valueZcross_attn_present_key_valueZcross_attn_past_key_valueZcross_attention_outputslayer_outputr7   r7   r8   rG   ^  sV    


	   

zSplinterLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )r3   r   Zintermediate_outputr   r7   r7   r8   r     s    
z SplinterLayer.feed_forward_chunk)NNNNNF)rH   rI   rJ   r!   r.   ry   r   rM   r   rz   rG   r   rO   r7   r7   r5   r8   r   O  s$         Ar   c                       s   e Zd Z fddZd	ejeej eej eej eej eeeej   ee	 ee	 ee	 ee	 e
eej ef dddZ  ZS )
SplinterEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r7   )r   ).0_r4   r7   r8   
<listcomp>  s     z,SplinterEncoder.__init__.<locals>.<listcomp>F)	r    r!   r4   r   Z
ModuleListrangenum_hidden_layerslayergradient_checkpointingr2   r5   r   r8   r!     s    
 zSplinterEncoder.__init__NFT)rf   rg   rh   ri   rj   past_key_valuesrw   rl   output_hidden_statesreturn_dictr=   c              	      st  |	rdnd } rdnd } r(| j jr(dnd }| jrJ| jrJ|rJtd d}|rRdnd }t| jD ]\}}|	rv||f }|d k	r|| nd }|d k	r|| nd | jr| jrև fdd}tj	j

|||||||}n|||||| }|d }|r||d f7 } r`||d f }| j jr`||d	 f }q`|	r@||f }|
sbtd
d |||||fD S t|||||dS )Nr7   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fc                    s    fdd}|S )Nc                     s    | f S r   r7   )inputs)modulerl   rk   r7   r8   custom_forward  s    zNSplinterEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr7   )r   r   r   )r   r8   create_custom_forward  s    z6SplinterEncoder.forward.<locals>.create_custom_forwardr   r   r   rT   c                 s   s   | ]}|d k	r|V  qd S r   r7   )r   vr7   r7   r8   	<genexpr>  s   z*SplinterEncoder.forward.<locals>.<genexpr>Zlast_hidden_stater   rf   
attentionscross_attentions)r4   r   r   ZtrainingloggerZwarning_once	enumerater   r.   utils
checkpointtupler   )r3   rf   rg   rh   ri   rj   r   rw   rl   r   r   Zall_hidden_statesZall_self_attentionsZall_cross_attentionsZnext_decoder_cacheiZlayer_moduleZlayer_head_maskr   Zlayer_outputsr7   r   r8   rG     sv    
	

zSplinterEncoder.forward)	NNNNNNFFT)rH   rI   rJ   r!   r.   ry   r   rM   r   rz   r   r   rG   rO   r7   r7   r5   r8   r     s.   	         r   c                   @   s.   e Zd ZdZeZdZdZdd Zd
ddZ	d	S )SplinterPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    splinterTc                 C   s   t |tjr:|jjjd| jjd |jdk	r|jj	  nft |tj
rz|jjjd| jjd |jdk	r|jj|j 	  n&t |tjr|jj	  |jjd dS )zInitialize the weightsg        )ZmeanZstdNg      ?)r   r   rZ   weightdataZnormal_r4   Zinitializer_rangebiasZzero_r"   r   r)   Zfill_)r3   r   r7   r7   r8   _init_weights  s    

z%SplinterPreTrainedModel._init_weightsFc                 C   s   t |tr||_d S r   )r   r   r   )r3   r   r]   r7   r7   r8   _set_gradient_checkpointing#  s    
z3SplinterPreTrainedModel._set_gradient_checkpointingN)F)
rH   rI   rJ   rK   r   config_classZbase_model_prefixZsupports_gradient_checkpointingr   r   r7   r7   r7   r8   r     s   r   aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SplinterConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a/
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare Splinter Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd ZdZ fddZdd Zdd Zdd	 Zee	
d
eeeeddeej eej eej eej eej eej eej eej eeej  ee ee ee ee eeef dddZ  ZS )SplinterModela*  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                    s2   t  | || _t|| _t|| _|   d S r   )r    r!   r4   r   rF   r   encoder	post_initr2   r5   r7   r8   r!   p  s
    

zSplinterModel.__init__c                 C   s   | j jS r   rF   r%   )r3   r7   r7   r8   get_input_embeddingsz  s    z"SplinterModel.get_input_embeddingsc                 C   s   || j _d S r   r   )r3   r]   r7   r7   r8   set_input_embeddings}  s    z"SplinterModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r3   Zheads_to_pruner   r   r7   r7   r8   _prune_heads  s    zSplinterModel._prune_headsbatch_size, sequence_lengthr   output_typer   N)r9   rg   r:   r   rh   r;   ri   rj   r   rw   rl   r   r   r=   c                 C   s  |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| j jrZ|
dk	rP|
n| j j}
nd}
|dk	rx|dk	rxtdn@|dk	r| || | }n"|dk	r| dd }ntd|\}}|dk	r|j	n|j	}|	dk	r|	d d j
d nd}|dkrtj||| f|d}|dkr.tj|tj|d	}| ||}| j jr|dk	r| \}}}||f}|dkr|tj||d}| |}nd}| || j j}| j|||||d
}| j||||||	|
|||d
}|d }|s|f|dd  S t||j|j|j|jdS )a  
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   rT   )r@   r>   )r9   r   r:   r;   r<   )	rg   rh   ri   rj   r   rw   rl   r   r   r   r   )r4   rl   r   use_return_dictr_   rw   rW   Z%warn_if_padding_and_no_attention_maskrA   r@   rr   r.   ZonesrB   rC   Zget_extended_attention_maskZinvert_attention_maskZget_head_maskr   rF   r   r   r   rf   r   r   )r3   r9   rg   r:   r   rh   r;   ri   rj   r   rw   rl   r   r   rD   
batch_sizerE   r@   r<   Zextended_attention_maskZencoder_batch_sizeZencoder_sequence_lengthr   Zencoder_hidden_shapeZencoder_extended_attention_maskZembedding_outputZencoder_outputssequence_outputr7   r7   r8   rG     sx    )




zSplinterModel.forward)NNNNNNNNNNNNN)rH   rI   rJ   rK   r!   r   r   r   r   SPLINTER_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r.   ry   r   rM   rz   r   r   rG   rO   r7   r7   r5   r8   r   e  sP   
             
r   c                       s2   e Zd Zd fdd	ZejejdddZ  ZS )SplinterFullyConnectedLayergeluc                    sD   t    || _|| _t| j| j| _t| | _t	| j| _	d S r   )
r    r!   	input_dim
output_dimr   rZ   r}   r
   act_fnr)   )r3   r   r   r   r5   r7   r8   r!   	  s    

z$SplinterFullyConnectedLayer.__init__)r   r=   c                 C   s"   |  |}| |}| |}|S r   )r}   r   r)   )r3   r   rf   r7   r7   r8   rG     s    


z#SplinterFullyConnectedLayer.forward)r   r   r7   r7   r5   r8   r     s   
r   c                       s(   e Zd ZdZ fddZdd Z  ZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    sz   t    t|j|j| _t|j|j| _t|j|j| _t|j|j| _tj	|j|jdd| _
tj	|j|jdd| _d S )NF)r   )r    r!   r   r#   query_start_transformquery_end_transformstart_transformend_transformr   rZ   start_classifierend_classifierr2   r5   r7   r8   r!      s    
z'QuestionAwareSpanSelectionHead.__init__c                 C   s   |  \}}}|ddd|}tj|d|d}| |}| |}| |}	| |}
| 	|}|	
ddd}	t||	}| |}|

ddd}
t||
}||fS )Nr   r   )rn   r   r   rT   )rA   	unsqueezerepeatr.   gatherr   r   r   r   r   rc   rq   r   )r3   r   	positionsr   rn   r   Zgathered_repsZquery_start_repsZquery_end_repsZ
start_repsZend_repsrf   start_logits
end_logitsr7   r7   r8   rG   +  s    





z&QuestionAwareSpanSelectionHead.forward)rH   rI   rJ   rK   r!   rG   rO   r7   r7   r5   r8   r     s   r   z
    Splinter Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       s   e Zd Z fddZeedeee	e
dd	eej eej eej eej eej eej eej eej ee ee ee eej eee	f dddZ  ZS )
SplinterForQuestionAnsweringc                    s4   t  | t|| _t|| _|j| _|   d S r   r    r!   r   r   r   splinter_qassquestion_token_idr   r2   r5   r7   r8   r!   H  s
    

z%SplinterForQuestionAnswering.__init__r   r   Nr9   rg   r:   r   rh   r;   start_positionsend_positionsrl   r   r   question_positionsr=   c                 C   s  |dk	r|n| j j}d}|dkrr|dk	rFtjt|| j dd}ntj|dtj	|j
|jd}|d}d}| j|||||||	|
|d	}|d }| ||\}}|r|d	|d	 }}|dk	r|d	| t|jj  }|d	| t|jj  }d}|dk	r|dk	rt| d	kr0|d}t| d	krL|d}|d	}|d| |d| t|d
}|||}|||}|| d }|s||f|d	d  }|dk	r|f| S |S t||||j|jdS )a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NFr   rm   r   )r?   layoutr@   Trg   r:   r   rh   r;   rl   r   r   r   Zignore_indexrT   lossr   r   rf   r   )r4   r   r.   Zargmaxeqr   rN   rB   rA   rC   r   r@   r   r   r   Zsqueezefinfor?   minr   clamp_r   r   rf   r   )r3   r9   rg   r:   r   rh   r;   r   r   rl   r   r   r   Zquestion_positions_were_noneZ"question_position_for_each_examplerx   r   r   r   
total_lossZignored_indexloss_fct
start_lossend_lossr   r7   r7   r8   rG   R  sr    $    






z$SplinterForQuestionAnswering.forward)NNNNNNNNNNNN)rH   rI   rJ   r!   r   r   r   r   r   r   r   r   r.   ry   rL   rz   r   r   rG   rO   r7   r7   r5   r8   r   @  sD   
            
r   c                   @   sl   e Zd ZU dZdZeej ed< dZ	ejed< dZ
ejed< dZeeej  ed< dZeeej  ed< dS )SplinterForPreTrainingOutputa  
    Class for outputs of Splinter as a span selection model.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nr   r   r   rf   r   )rH   rI   rJ   rK   r   r   r.   rM   __annotations__r   r   rf   r   r   r7   r7   r7   r8   r     s   
r   z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       s   e Zd Z fddZeeddeej	 eej	 eej	 eej	 eej	 eej	 eej
 eej
 ee ee ee eej
 eeef dddZej	ej	dd	d
Z  ZS )SplinterForPreTrainingc                    s4   t  | t|| _t|| _|j| _|   d S r   r   r2   r5   r7   r8   r!     s
    

zSplinterForPreTraining.__init__z*batch_size, num_questions, sequence_lengthNr   c                 C   s  |dk	r|n| j j}|dkr6|dk	r6|dk	r6tdn,|dkrP|dkrPtdn|dkrb| |}| j|||||||	|
|d	}|d }| \}}}| ||\}}|d}|dk	r|d|||}|d| t	
|jj  }|d| t	
|jj  }d}|dk	r|dk	r|dtd|d  |dtd|d  t| j jd}|||| |||| }|||| |||| }|| d }|s||f|dd  }|dk	r|f| S |S t||||j|jd	S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz>question_positions must be specified when input_embeds is usedr   r   r   r   rT   r   )r4   r   	TypeError_prepare_question_positionsr   rA   r   r   r0   r.   r   r?   r   r   maxr   r$   rb   r   rf   r   )r3   r9   rg   r:   r   rh   r;   r   r   rl   r   r   r   rx   r   r   Zsequence_lengthrn   r   r   num_questionsZ attention_mask_for_each_questionr   r   r   r   r   r7   r7   r8   rG     sl    !




  zSplinterForPreTraining.forward)r9   r=   c                 C   sl   t || jjk\}}t |}t j|d| f| jjt j	|j
d}t dd |D }||||f< |S )Nr   r>   c                 S   s   g | ]}t |qS r7   )r.   r/   )r   nr7   r7   r8   r   \  s     zFSplinterForPreTraining._prepare_question_positions.<locals>.<listcomp>)r.   wherer4   r   ZbincountfullrA   r  r$   rC   r@   rp   )r3   r9   rowsZflat_positionsr  r   colsr7   r7   r8   r   S  s    
z2SplinterForPreTraining._prepare_question_positions)NNNNNNNNNNNN)rH   rI   rJ   r!   r   r   r   r   r.   ry   rL   rz   r   r   r   rG   r   rO   r7   r7   r5   r8   r     s@   	
            
dr   )9rK   rt   dataclassesr   typingr   r   r   r   r.   Ztorch.utils.checkpointr   Ztorch.nnr   Zactivationsr
   Zmodeling_outputsr   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   r   r   r   r   r   Zconfiguration_splinterr   Z
get_loggerrH   r   r   r   Z&SPLINTER_PRETRAINED_MODEL_ARCHIVE_LISTModuler   rP   r{   r   r   r   r   r   r   ZSPLINTER_START_DOCSTRINGr   r   r   r   r   r   r   r7   r7   r7   r8   <module>   sj   
	7 2Wb 2  &r