U
    9%ek                 
   @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlmZ ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$ e!%e&Z'dZ(dZ)ddddddddddg
Z*dZ+dd Z,G dd dej-Z.G dd dej-Z/ej0e1e1ej0d d!d"Z2G d#d$ d$ej-Z3G d%d& d&ej-Z4G d'd( d(ej-Z5G d)d* d*ej-Z6dTej0e1e1e7e7ej0d-d.d/Z8G d0d1 d1ej-Z9G d2d3 d3ej-Z:G d4d5 d5eZ;G d6d7 d7ej-Z<eG d8d9 d9eZ=d:Z>d;Z?ed<e>G d=d> d>e;Z@ed?e>G d@dA dAe;ZAedBe> G dCdD dDe;ZBedEe>G dFdG dGe;ZCedHe>G dIdJ dJe;ZDedKe>G dLdM dMe;ZEedNe>G dOdP dPe;ZFedQe>G dRdS dSe;ZGdS )Uz" PyTorch Funnel Transformer model.    N)	dataclass)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )FunnelConfigr   zfunnel-transformer/smallfunnel-transformer/small-basezfunnel-transformer/mediumzfunnel-transformer/medium-basezfunnel-transformer/intermediatez$funnel-transformer/intermediate-basezfunnel-transformer/largezfunnel-transformer/large-basezfunnel-transformer/xlarge-basezfunnel-transformer/xlargeg    .Ac                 C   s  zddl }ddl}ddl}W n  tk
r<   td  Y nX tj|}t	d|  |j
|}g }g }	|D ]@\}
}t	d|
 d|  |j
||
}||
 |	| qrddd	d
ddddddddddd}t||	D ]\}
}|
d}
tdd |
D r t	dd|
  q|
d dkr0q| }d}|
dd D ] }t|ts|d|rt|d| d }||jk rd}||j| kr||j| 8 }|d7 }q|j| | }n||j8 }|j| }n|dkr t|tr |j} qhnd||krt||| }nJzt||}W n: tk
rb   t dd|
 |j! d}Y  qhY nX qD|st"|j!t"|j!kr|#|j!}|dkr|$|}t%&||_'q| S ) z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape k_headq_headv_head	post_projlinear_1linear_2	attentionffnweightbiasword_embeddings
embeddings)kqvoZlayer_1Zlayer_2Zrel_attnffkernelgammabetaZlookup_tableZword_embeddinginput/c                 s   s   | ]}|d kV  qdS ))Zadam_vZadam_mZAdamWeightDecayOptimizerZAdamWeightDecayOptimizer_1Zglobal_stepN ).0nr3   r3   i/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/transformers/models/funnel/modeling_funnel.py	<genexpr>r   s   z,load_tf_weights_in_funnel.<locals>.<genexpr>z	Skipping 	generatorFr   z	layer_\d+zlayer_(\d+)rTr.   )(renumpyZ
tensorflowImportErrorloggererrorospathabspathinfotrainZlist_variablesZload_variableappendzipsplitanyjoin
isinstanceFunnelPositionwiseFFN	fullmatchintsearchgroupsZnum_hidden_layersblock_sizesblockslayersFunnelRelMultiheadAttentionr_kernelgetattrAttributeErrorprintshapelenreshapeZ	transposetorchZ
from_numpydata)modelconfigZtf_checkpoint_pathr:   nptfZtf_pathZ	init_varsnamesZarraysnamerW   arrayZ
_layer_mapZpointerZskippedZm_namelayer_indexZ	block_idxr3   r3   r6   load_tf_weights_in_funnelD   s    






rd   c                       sF   e Zd Zedd fddZdeej eej ejdddZ  Z	S )	FunnelEmbeddingsNr]   returnc                    sH   t    tj|j|j|jd| _tj|j	|j
d| _t|j| _d S )N)padding_idxZeps)super__init__r   	Embedding
vocab_sizehidden_sizeZpad_token_idr'   	LayerNormd_modellayer_norm_eps
layer_normDropouthidden_dropoutdropoutselfr]   	__class__r3   r6   rk      s    
zFunnelEmbeddings.__init__)	input_idsinputs_embedsrg   c                 C   s*   |d kr|  |}| |}| |}|S N)r'   rr   ru   )rw   rz   r{   r(   r3   r3   r6   forward   s
    


zFunnelEmbeddings.forward)NN)
__name__
__module____qualname__r   rk   r   rZ   Tensorr}   __classcell__r3   r3   rx   r6   re      s       re   c                       s  e Zd ZU dZdZeed< edd fddZd$e	j
ee	j
 ee	j
 ee	j
 dd	d
Ze	j
e	j
dddZee	je	jeee	j
 eee	j
  f dddZe	j
edddZd%e	j
eee	j
dddZee	j
ee	j
 ee	j
 f eeee ee f e	j
dddZd&ee	j
ee	j
 ee	j
 f eee	j
dddZee	j
 ee	j
ee	j
 f dd d!Zee	j
 ee	j
 dd"d#Z  ZS )'FunnelAttentionStructurez>
    Contains helpers for `FunnelRelMultiheadAttention `.
       cls_token_type_idNrf   c                    s6   t    || _t|j| _t|j| _d | _d S r|   )	rj   rk   r]   r   rs   rt   sin_dropoutcos_dropoutpooling_multrv   rx   r3   r6   rk      s
    
z!FunnelAttentionStructure.__init__)r{   attention_masktoken_type_idsrg   c                 C   sv   d| _ |d | _}| ||j|j}|dk	r:| |nd}| jjrft	j
||d |d gdnd}||||fS )zCReturns the attention inputs associated to the inputs of the model.r   N)r   r   r   r   )r   sizeseq_lenget_position_embedsdtypedevicetoken_type_ids_to_matr]   separate_clsr   
functionalpadZnew_ones)rw   r{   r   r   r   position_embedstoken_type_matcls_maskr3   r3   r6   init_attention_inputs   s    	"z.FunnelAttentionStructure.init_attention_inputs)r   rg   c                 C   s^   |dddddf |dddf k}|| j k}|dddddf |dddf B }||B S )z-Convert `token_type_ids` to `token_type_mat`.N)r   )rw   r   r   Zcls_idsZcls_matr3   r3   r6   r      s    &
&z.FunnelAttentionStructure.token_type_ids_to_mat)r   r   r   rg   c                 C   s  | j j}| j jdkrtjd|d||d}tjd|d d||d}dd||d    }|dddf |d  }t|}	| |	}
t|}| |}tj	|
|
gd	d
}tj	||	gd	d
}tj	||gd	d
}tj	|	 |gd	d
}||||fS tjd|d d||d}dd||d    }tj| d |d d||d}|d }|dddf |d  }| t|}	| t|}tj	|	|gd	d
}tjd|||d}|}g }t
d| j jD ]}|dkrd}n^| ||}d|d  }| j|||dd}|dddf | }||d|}t|d|}|}d| }| ||}|dddf | }||d|}t|d|}|||g q|S dS )a  
        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
        are using the factorized or the relative shift attention:

        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
        final formula.

        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
        formula.

        Paper link: https://arxiv.org/abs/2006.03236
        
factorizedr         ?r   r   r   r   i'  Ndim)shift)r]   rp   attention_typerZ   arangesinr   cosr   catrangeZ
num_blocksstride_pool_posrelative_posexpandr   gatherrD   )rw   r   r   r   rp   Zpos_seqZfreq_seqZinv_freqZsinusoidZ	sin_embedZsin_embed_dZ	cos_embedZcos_embed_dphipsipiomegaZ
rel_pos_idZzero_offsetZ	pos_embedpos
pooled_posZposition_embeds_listblock_indexZposition_embeds_poolingstrideZrel_posZposition_embeds_no_poolingr3   r3   r6   r      sV    




z,FunnelAttentionStructure.get_position_embeds)pos_idr   c                 C   sj   | j jrX|d|  d g}| j jr2|dd n
|dd }t||ddd gdS |ddd S dS )ze
        Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
        r   r   r   Nr   )r]   r   Z
new_tensortruncate_seqrZ   r   )rw   r   r   Zcls_posZpooled_pos_idr3   r3   r6   r   +  s
     z(FunnelAttentionStructure.stride_pool_posr   )r   r   r   rg   c           	      C   sb   |dkr|}|d |d  }|t | }|||  }|d |d  }tj||d | tj|jdS )zV
        Build the relative positional vector between `pos` and `pooled_pos`.
        Nr   r   r   r   )rX   rZ   r   longr   )	rw   r   r   r   r   Z	ref_pointZ
num_removeZmax_distZmin_distr3   r3   r6   r   :  s    z%FunnelAttentionStructure.relative_pos)tensoraxisrg   c                    s   |dkrdS t  ttfr4 D ]}||}q|S t |ttfr^t| fdd|D S  |j;  jjrjjrt	dddn
t	ddd}t	dg  |g }jjrt	dg  t	ddg }t
j|| |g d}|| S )zT
        Perform pooling by stride slicing the tensor along the given axis.
        Nc                 3   s   | ]} | V  qd S r|   )stride_poolr4   xr   rw   r3   r6   r7   [  s     z7FunnelAttentionStructure.stride_pool.<locals>.<genexpr>r   r   r   )r   )rI   listtupler   typendimr]   r   r   slicerZ   r   )rw   r   r   ZaxZ
axis_sliceZ	enc_sliceZ	cls_slicer3   r   r6   r   H  s     
&z$FunnelAttentionStructure.stride_poolmean)r   moder   rg   c                    s  dkrdS t ttfr:t fddD S jjrjjr^ddddf n}tjddddf |gddj	}|dkrddddddf n$|dkrЈdddddddf df d	krt
jjd
dnL dkrt
jjd
dn, dkr:t
jj d
d ntd|dkrdddddddf S |dkr~dddf S S )z3Apply 1D pooling to a tensor of size [B x T (x H)].Nc                 3   s   | ]}j  d V  qdS ))r   r   N)pool_tensorr   r   rw   r   r   r3   r6   r7   r  s     z7FunnelAttentionStructure.pool_tensor.<locals>.<genexpr>r   r   r   r   r   r   T)r   Z	ceil_modemaxminz0The supported modes are 'mean', 'max' and 'min'.r   )rI   r   r   r   r]   r   r   rZ   r   r   r   r   Z
avg_pool2dZ
max_pool2dNotImplementedError)rw   r   r   r   suffixr   r3   r   r6   r   i  s2      "



z$FunnelAttentionStructure.pool_tensor)attention_inputsrg   c                 C   s   |\}}}}| j jrl| j jdkr@| |dd d|dd  }| |d}| |d}| j|| j jd}nf|  jd9  _| j jdkr| |d}| |ddg}| |ddg}| j|dd}| j|| j jd}||||f}||fS )zTPool `output` and the proper parts of `attention_inputs` before the attention layer.r   Nr   r   r   r   r   )r]   pool_q_onlyr   r   r   Zpooling_typer   )rw   outputr   r   r   r   r   r3   r3   r6   pre_attention_pooling  s      z.FunnelAttentionStructure.pre_attention_poolingc                 C   s   |\}}}}| j jrt|  jd9  _| j jdkrN|dd | |dd d }| |d}| |d}| j|dd}||||f}|S )zFPool the proper parts of `attention_inputs` after the attention layer.r   r   Nr   r   r   r   )r]   r   r   r   r   r   )rw   r   r   r   r   r   r3   r3   r6   post_attention_pooling  s     z/FunnelAttentionStructure.post_attention_pooling)NN)Nr   )r   r   )r~   r   r   __doc__r   rL   __annotations__r   rk   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   strr   r   r   r   r3   r3   rx   r6   r      sF   
  	  P"     'r   )positional_attncontext_lenr   rg   c                 C   sn   | j \}}}}t| ||||g} | d d d d |d d d f } t| ||||| g} | dd |f } | S )N.)rW   rZ   rY   )r   r   r   
batch_sizen_headr   Zmax_rel_lenr3   r3   r6   _relative_shift_gather  s     r   c                	       sj   e Zd Zeedd fddZdddZdddZdej	ej	ej	e
ej	 ee
ej	d
f dddZ  ZS )rR   Nr]   r   rg   c                    s*  t    || _|| _|j|j|j  }}}t|j	| _	t|j
| _
tj||| dd| _t||| | _t||| | _tt||g| _tt||g| _tt|||g| _tt||g| _ttd||g| _t|| || _tj||jd| _d|d  | _d S )NF)r&   r   ri   r   g      ?)rj   rk   r]   r   rp   r   d_headr   rs   rt   attention_dropoutLinearr   r   r   	ParameterrZ   zerosr_w_biasr_r_biasrS   r_s_bias	seg_embedr    ro   rq   rr   scale)rw   r]   r   rp   r   r   rx   r3   r6   rk     s"    
z$FunnelRelMultiheadAttention.__init__c                 C   s   | j jdkr|\}}}}| j| j }	| j}
td||	 |
}||dddf  }||dddf  }td||td|| }nf|jd |krdnd}|| j |d  }| j| j }| j}
td||
}td|| |}t	|||}|dk	r||9 }|S )	z5Relative attention score for the positional encodingsr   zbinh,dnh->bindNzbind,jd->bnijr   r   ztd,dnh->tnhzbinh,tnh->bnit)
r]   r   r   r   rS   rZ   einsumrW   r   r   )rw   r   r   r   r   r   r   r   r   uZw_rZq_r_attentionZq_r_attention_1Zq_r_attention_2r   r   r9   r+   Zr_headr3   r3   r6   relative_positional_attention  s,      z9FunnelRelMultiheadAttention.relative_positional_attentionc                 C   s   |dkrdS |j \}}}| j| j }td|| | j}|dddf ||j d ||g}tj|ddd\}	}
t||
|j |	|j }|dk	r||9 }|S )z/Relative attention score for the token_type_idsNr   zbind,snd->bnisr   r   r   r   )	rW   r   r   rZ   r   r   r   rF   where)rw   r   r   r   r   r   r   r   Ztoken_type_biasZdiff_token_typeZsame_token_typetoken_type_attnr3   r3   r6   relative_token_type_attention  s    $ 
 
z9FunnelRelMultiheadAttention.relative_token_type_attentionF.)querykeyvaluer   output_attentionsrg   c                 C   sn  |\}}}}	|j \}
}}|j d }| jj| jj }}| ||
|||}| ||
|||}| ||
|||}|| j }| j	| j }t
d|| |}| ||||	}| |||	}|| | }|j}| }|d k	r|td|d d d d f     }t
j|d|d}| |}t
d||}| ||
||| }| |}| || }|rh||fS |fS )Nr   zbind,bjnd->bnijr   )r   r   zbnij,bjnd->bind)rW   r]   r   r   r   viewr   r   r   r   rZ   r   r   r   r   floatINFZsoftmaxr   r    rY   rt   rr   )rw   r   r   r   r   r   r   r   r   r   r   r   _r   r   r   r   r   r   r   Zcontent_scorer   r   Z
attn_scorer   Z	attn_probZattn_vecZattn_outr   r3   r3   r6   r}     s0    



"

z#FunnelRelMultiheadAttention.forward)N)N)F)r~   r   r   r   rL   rk   r   r   rZ   r   r   boolr}   r   r3   r3   rx   r6   rR     s   
*
 rR   c                       s8   e Zd Zedd fddZejejdddZ  ZS )rJ   Nrf   c                    sl   t    t|j|j| _t|j | _	t
|j| _t|j|j| _t
|j| _t|j|j| _d S r|   )rj   rk   r   r   rp   Zd_innerr!   r   
hidden_actactivation_functionrs   activation_dropoutr"   rt   ru   ro   rq   rr   rv   rx   r3   r6   rk   T  s    
zFunnelPositionwiseFFN.__init__hiddenrg   c                 C   s@   |  |}| |}| |}| |}| |}| || S r|   )r!   r   r   r"   ru   rr   )rw   r   hr3   r3   r6   r}   ]  s    




zFunnelPositionwiseFFN.forward)	r~   r   r   r   rk   rZ   r   r}   r   r3   r3   rx   r6   rJ   S  s   	rJ   c                       sD   e Zd Zeedd fddZd	ejejejee	dddZ
  ZS )
FunnelLayerNr   c                    s$   t    t||| _t|| _d S r|   )rj   rk   rR   r#   rJ   r$   )rw   r]   r   rx   r3   r6   rk   g  s    
zFunnelLayer.__init__F)r   r   r   r   rg   c                 C   s8   | j |||||d}| |d }|r2||d fS |fS )Nr   r   r   )r#   r$   )rw   r   r   r   r   r   Zattnr   r3   r3   r6   r}   l  s    zFunnelLayer.forward)F)r~   r   r   r   rL   rk   rZ   r   r   r   r}   r   r3   r3   rx   r6   r   f  s    r   c                
       sV   e Zd Zedd fddZd
ejeej eej eeee	e
ef ddd	Z  ZS )FunnelEncoderNrf   c                    s>   t     | _t | _t fddt jD | _	d S )Nc                    s.   g | ]&\ }t  fd dt|D qS )c                    s   g | ]}t  qS r3   r   r4   r   )r   r]   r3   r6   
<listcomp>  s     z5FunnelEncoder.__init__.<locals>.<listcomp>.<listcomp>)r   
ModuleListr   )r4   
block_sizer]   )r   r6   r     s   z*FunnelEncoder.__init__.<locals>.<listcomp>)
rj   rk   r]   r   attention_structurer   r   	enumeraterO   rP   rv   rx   r   r6   rk   z  s    


zFunnelEncoder.__init__FT)r{   r   r   r   output_hidden_statesreturn_dictrg   c              
   C   sl  | |}| jj|||d}|}|r*|fnd }	|r6dnd }
t| jD ]\}}|d| jjr`dndk}|op|dk}|r| j||\}}t|D ]\}}t	| jj
| D ]}|dko|dko|}|r|}| jjr|n| }}n| } }}||||||d}|d }|r| j|}|r,|
|dd   }
|r|	|f }	qqqD|s^tdd ||	|
fD S t||	|
d	S )
Nr   r   r3   r   r   r   r   c                 s   s   | ]}|d k	r|V  qd S r|   r3   r4   r+   r3   r3   r6   r7     s      z(FunnelEncoder.forward.<locals>.<genexpr>last_hidden_statehidden_states
attentions)Ztype_asr   r   r   rP   r   r]   r   r   r   Zblock_repeatsr   r   r   r   )rw   r{   r   r   r   r   r  r   r   all_hidden_statesall_attentionsr   blockZpooling_flagZpooled_hiddenrc   layerZrepeat_indexZ
do_poolingr   r   r   layer_outputr3   r3   r6   r}     sF    

 zFunnelEncoder.forward)NNFFTr~   r   r   r   rk   rZ   r   r   r   r   r   r   r}   r   r3   r3   rx   r6   r   y  s        
r   TF)r   r   
target_lenr   r   rg   c              	   C   s   |dkr| S |r8| ddddf }| ddddf } t j| |dd}|r|rntj|ddd|d ddf}|ddd|d f }t j||gdd}n|ddd|f }|S )z{
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
    r   N)Zrepeatsr   r   r   )rZ   Zrepeat_interleaver   r   r   r   )r   r   r  r   r   clsr   r3   r3   r6   upsample  s    r  c                       sZ   e Zd Zedd fddZd
ejejeej eej eeee	e
ef ddd	Z  ZS )FunnelDecoderNrf   c                    s>   t     | _t | _t fddt jD | _	d S )Nc                    s   g | ]}t  d qS )r   r   r   r   r3   r6   r     s     z*FunnelDecoder.__init__.<locals>.<listcomp>)
rj   rk   r]   r   r   r   r   r   Znum_decoder_layersrQ   rv   rx   r   r6   rk     s    

zFunnelDecoder.__init__FT)final_hiddenfirst_block_hiddenr   r   r   r   r  rg   c                 C   s   t |dt| jjd  |jd | jj| jjd}|| }	|rB|	fnd }
|rNdnd }| jj|	||d}| j	D ]@}||	|	|	||d}|d }	|r||dd   }|rj|
|	f }
qj|st
dd	 |	|
|fD S t|	|
|d
S )Nr   r   )r   r  r   r   r3   r  r   r   c                 s   s   | ]}|d k	r|V  qd S r|   r3   r  r3   r3   r6   r7     s      z(FunnelDecoder.forward.<locals>.<genexpr>r  )r  rX   r]   rO   rW   r   r   r   r   rQ   r   r   )rw   r  r  r   r   r   r   r  Zupsampled_hiddenr   r  r	  r   r  r  r3   r3   r6   r}     s2    

zFunnelDecoder.forward)NNFFTr  r3   r3   rx   r6   r    s    
     
r  c                       s<   e Zd ZdZedd fddZejejdddZ  Z	S )	FunnelDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.Nrf   c                    s6   t    || _t|j|j| _t|jd| _d S Nr   )rj   rk   r]   r   r   rp   densedense_predictionrv   rx   r3   r6   rk     s    
z'FunnelDiscriminatorPredictions.__init__)discriminator_hidden_statesrg   c                 C   s,   |  |}t| jj |}| | }|S r|   )r  r   r]   r   r  squeeze)rw   r  r  logitsr3   r3   r6   r}     s    
z&FunnelDiscriminatorPredictions.forward)
r~   r   r   r   r   rk   rZ   r   r}   r   r3   r3   rx   r6   r    s   r  c                   @   s$   e Zd ZdZeZeZdZdd Z	dS )FunnelPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    funnelc                 C   sj  |j j}|ddkrt|dd d k	rp| jjd krV|jj\}}t	dt
||  }n| jj}tjj|j|d t|dd d k	rtj|jd n|dkrtjj|j| jjd	 tjj|j| jjd	 tjj|j| jjd	 tjj|j| jjd	 tjj|j| jjd	 nZ|d
krf| jjd kr(dn| jj}tjj|jj|d |jjd k	rf|jjj|j   d S )Nr   r   r%   r   )stdr&   g        rR   )bre   )ry   r~   findrT   r]   Zinitializer_stdr%   rW   r^   sqrtr   r   initZnormal_Z	constant_r&   Zuniform_r   Zinitializer_ranger   rS   r   r   r'   rh   r[   Zzero_)rw   module	classnameZfan_outZfan_inr  r3   r3   r6   _init_weights  s*    

z#FunnelPreTrainedModel._init_weightsN)
r~   r   r   r   r   config_classrd   Zload_tf_weightsZbase_model_prefixr$  r3   r3   r3   r6   r    s
   r  c                       s:   e Zd Zeedd fddZejejdddZ  Z	S )FunnelClassificationHeadN)r]   n_labelsrg   c                    s>   t    t|j|j| _t|j| _t|j|| _	d S r|   )
rj   rk   r   r   rp   linear_hiddenrs   rt   ru   
linear_out)rw   r]   r'  rx   r3   r6   rk   3  s    
z!FunnelClassificationHead.__init__r   c                 C   s(   |  |}t|}| |}| |S r|   )r(  rZ   tanhru   r)  )rw   r   r3   r3   r6   r}   9  s    


z FunnelClassificationHead.forward)
r~   r   r   r   rL   rk   rZ   r   r}   r   r3   r3   rx   r6   r&  2  s   r&  c                   @   s^   e Zd ZU dZdZeej ed< dZ	ejed< dZ
eeej  ed< dZeeej  ed< dS )FunnelForPreTrainingOutputa  
    Output type of [`FunnelForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss of the ELECTRA-style objective.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossr  r  r  )r~   r   r   r   r,  r   rZ   ZFloatTensorr   r  r  r   r  r3   r3   r3   r6   r+  @  s
   
r+  a(  

    The Funnel Transformer model was proposed in [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
    Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z
    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
    decoder) or any task-specific head on top.
    c                       s   e Zd Zedd fddZejdddZejddd	d
Ze	e
dedeeddeej eej eej eej eej eej ee ee ee eeef d
ddZ  ZS )FunnelBaseModelNrf   c                    s,   t  | t|| _t|| _|   d S r|   )rj   rk   re   r(   r   encoder	post_initrv   rx   r3   r6   rk     s    

zFunnelBaseModel.__init__rg   c                 C   s   | j jS r|   r(   r'   rw   r3   r3   r6   get_input_embeddings  s    z$FunnelBaseModel.get_input_embeddingsnew_embeddingsrg   c                 C   s   || j _d S r|   r1  rw   r5  r3   r3   r6   set_input_embeddings  s    z$FunnelBaseModel.set_input_embeddingsbatch_size, sequence_lengthr   
checkpointoutput_typer%  )
rz   r   r   position_ids	head_maskr{   r   r   r  rg   c
                 C   s  |d k	r|n| j j}|d k	r |n| j j}|	d k	r4|	n| j j}	|d k	rV|d k	rVtdn@|d k	rt| || | }
n"|d k	r| d d }
ntd|d k	r|jn|j}|d krtj	|
|d}|d krtj
|
tj|d}|d kr| |}| j||||||	d}|S )NDYou cannot specify both input_ids and inputs_embeds at the same timer   5You have to specify either input_ids or inputs_embedsr   r   r   r   r   r   r  )r]   r   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr   r   rZ   onesr   r   r(   r.  )rw   rz   r   r   r<  r=  r{   r   r   r  input_shaper   encoder_outputsr3   r3   r6   r}     s8    


	zFunnelBaseModel.forward)	NNNNNNNNN)r~   r   r   r   rk   r   rl   r3  r7  r   FUNNEL_INPUTS_DOCSTRINGformatr   r   _CONFIG_FOR_DOCr   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   r-    s<   	         
r-  zlThe bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Zedd fddZejdddZejddd	d
Ze	e
deeeeddeej eej eej eej ee ee ee eeef dddZ  ZS )FunnelModelNrf   c                    s<   t  | || _t|| _t|| _t|| _| 	  d S r|   )
rj   rk   r]   re   r(   r   r.  r  decoderr/  rv   rx   r3   r6   rk     s    


zFunnelModel.__init__r0  c                 C   s   | j jS r|   r1  r2  r3   r3   r6   r3    s    z FunnelModel.get_input_embeddingsr4  c                 C   s   || j _d S r|   r1  r6  r3   r3   r6   r7    s    z FunnelModel.set_input_embeddingsr8  r9  )rz   r   r   r{   r   r   r  rg   c              	   C   s  |d k	r|n| j j}|d k	r |n| j j}|d k	r4|n| j j}|d k	rV|d k	rVtdn@|d k	rt| || | }n"|d k	r| d d }ntd|d k	r|jn|j}	|d krtj	||	d}|d krtj
|tj|	d}|d kr| |}| j||||d|d}
| j|
d |
d	 | j jd  |||||d
}|sd}|d f}|rf|d	7 }||
d	 ||  f }|r|d	7 }||
d ||  f }|S t|d |r|
j|j nd |r|
j|j nd dS )Nr>  r   r?  r@  r   TrA  r   r   )r  r  r   r   r   r   r  r   r  )r]   r   r   rB  rC  rD  r   r   rZ   rE  r   r   r(   r.  rL  rO   r   r  r  )rw   rz   r   r   r{   r   r   r  rF  r   rG  Zdecoder_outputsidxoutputsr3   r3   r6   r}     sj    


	

zFunnelModel.forward)NNNNNNN)r~   r   r   r   rk   r   rl   r3  r7  r   rH  rI  r   _CHECKPOINT_FOR_DOCr   rJ  r   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   rK    s4   
       
rK  z
    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
    generated tokens.
    c                       s   e Zd Zedd fddZeedee	e
dd
eej eej eej eej eej ee ee ee eee	f d	dd	Z  ZS )FunnelForPreTrainingNrf   c                    s,   t  | t|| _t|| _|   d S r|   )rj   rk   rK  r  r  discriminator_predictionsr/  rv   rx   r3   r6   rk   X  s    

zFunnelForPreTraining.__init__r8  )r;  r%  	rz   r   r   r{   labelsr   r   r  rg   c	              	   C   s   |dk	r|n| j j}| j|||||||d}	|	d }
| |
}d}|dk	rt }|dk	r|d|
jd dk}|d|
jd | }|| }||| }n||d|
jd | }|s|f|	dd  }|dk	r|f| S |S t	|||	j
|	jdS )a4  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
            docstring) Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, FunnelForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> logits = model(**inputs).logits
        ```Nr   r   r{   r   r   r  r   r   r   r,  r  r  r  )r]   rB  r  rQ  r   r   r   rW   r   r+  r  r  )rw   rz   r   r   r{   rS  r   r   r  r  Zdiscriminator_sequence_outputr  r,  loss_fctZactive_lossZactive_logitsZactive_labelsr   r3   r3   r6   r}   `  s<    #	
zFunnelForPreTraining.forward)NNNNNNNN)r~   r   r   r   rk   r   rH  rI  r   r+  rJ  r   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   rP  W  s,   
        
rP  z@Funnel Transformer Model with a `language modeling` head on top.c                       s   e Zd ZdgZedd fddZejdddZej	dd	d
dZ
eedeeeedddeej eej eej eej eej ee ee ee eeef d	ddZ  ZS )FunnelForMaskedLMzlm_head.weightNrf   c                    s4   t  | t|| _t|j|j| _| 	  d S r|   )
rj   rk   rK  r  r   r   rp   rm   lm_headr/  rv   rx   r3   r6   rk     s    
zFunnelForMaskedLM.__init__r0  c                 C   s   | j S r|   rX  r2  r3   r3   r6   get_output_embeddings  s    z'FunnelForMaskedLM.get_output_embeddingsr4  c                 C   s
   || _ d S r|   rY  r6  r3   r3   r6   set_output_embeddings  s    z'FunnelForMaskedLM.set_output_embeddingsr8  z<mask>)r:  r;  r%  maskrR  c	              	   C   s   |dk	r|n| j j}| j|||||||d}	|	d }
| |
}d}|dk	rlt }||d| j j|d}|s|f|	dd  }|dk	r|f| S |S t|||	j|	j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NrT  r   r   r   rU  )
r]   rB  r  rX  r	   r   rm   r   r  r  )rw   rz   r   r   r{   rS  r   r   r  rN  r  Zprediction_logitsZmasked_lm_lossrV  r   r3   r3   r6   r}     s2    

zFunnelForMaskedLM.forward)NNNNNNNN)r~   r   r   Z_tied_weights_keysr   rk   r   r   rZ  rl   r[  r   rH  rI  r   rO  r   rJ  r   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   rW    s<   	        
rW  z
    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
    first timestep of the last hidden state) e.g. for GLUE tasks.
    c                       s   e Zd Zedd fddZeedede	e
ddeej eej eej eej eej ee ee ee eee	f d	d	d
Z  ZS )FunnelForSequenceClassificationNrf   c                    s>   t  | |j| _|| _t|| _t||j| _|   d S r|   )	rj   rk   
num_labelsr]   r-  r  r&  
classifierr/  rv   rx   r3   r6   rk     s    
z(FunnelForSequenceClassification.__init__r8  r   r9  rR  c	              	   C   s~  |dk	r|n| j j}| j|||||||d}	|	d }
|
dddf }| |}d}|dk	r:| j jdkr| jdkr|d| j _n4| jdkr|jtjks|jtj	krd| j _nd| j _| j jdkrt
 }| jdkr|| | }n
|||}nN| j jdkrt }||d| j|d}n| j jdkr:t }|||}|sj|f|	dd  }|dk	rf|f| S |S t|||	j|	jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrT  r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr   rU  )r]   rB  r  r_  Zproblem_typer^  r   rZ   r   rL   r
   r  r	   r   r   r   r  r  )rw   rz   r   r   r{   rS  r   r   r  rN  r  pooled_outputr  r,  rV  r   r3   r3   r6   r}     sR    




"


z'FunnelForSequenceClassification.forward)NNNNNNNN)r~   r   r   r   rk   r   rH  rI  r   r   rJ  r   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   r]    s4   
        
r]  z
    Funnel Transformer Model with a multiple choice classification head on top (two linear layer on top of the first
    timestep of the last hidden state, and a softmax) e.g. for RocStories/SWAG tasks.
    c                       s   e Zd Zedd fddZeedede	e
ddeej eej eej eej eej ee ee ee eee	f d	d	d
Z  ZS )FunnelForMultipleChoiceNrf   c                    s.   t  | t|| _t|d| _|   d S r  )rj   rk   r-  r  r&  r_  r/  rv   rx   r3   r6   rk   X  s    
z FunnelForMultipleChoice.__init__z(batch_size, num_choices, sequence_lengthr   r9  rR  c	              	   C   sX  |dk	r|n| j j}|dk	r&|jd n|jd }	|dk	rJ|d|dnd}|dk	rh|d|dnd}|dk	r|d|dnd}|dk	r|d|d|dnd}| j|||||||d}
|
d }|dddf }| |}|d|	}d}|dk	rt }|||}|sD|f|
dd  }|dk	r@|f| S |S t|||
j	|
j
dS )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   rT  r   rU  )r]   rB  rW   r   r   r  r_  r	   r   r  r  )rw   rz   r   r   r{   rS  r   r   r  Znum_choicesrN  r  r`  r  Zreshaped_logitsr,  rV  r   r3   r3   r6   r}   `  sF    



zFunnelForMultipleChoice.forward)NNNNNNNN)r~   r   r   r   rk   r   rH  rI  r   r   rJ  r   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   ra  P  s4           
ra  z
    Funnel Transformer Model with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    c                       s   e Zd Zedd fddZeedee	e
edd
eej eej eej eej eej ee ee ee eee
f d	dd	Z  ZS )FunnelForTokenClassificationNrf   c                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r|   )rj   rk   r^  rK  r  r   rs   rt   ru   r   rn   r_  r/  rv   rx   r3   r6   rk     s    
z%FunnelForTokenClassification.__init__r8  r9  rR  c	              	   C   s   |dk	r|n| j j}| j|||||||d}	|	d }
| |
}
| |
}d}|dk	rtt }||d| j|d}|s|f|	dd  }|dk	r|f| S |S t|||	j	|	j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        NrT  r   r   r   rU  )r]   rB  r  ru   r_  r	   r   r^  r   r  r  )rw   rz   r   r   r{   rS  r   r   r  rN  r  r  r,  rV  r   r3   r3   r6   r}     s4    


z$FunnelForTokenClassification.forward)NNNNNNNN)r~   r   r   r   rk   r   rH  rI  r   rO  r   rJ  r   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   rc    s4           
rc  z
    Funnel Transformer Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       s   e Zd Zedd fddZeedee	e
edd
eej eej eej eej eej eej ee ee ee eee
f d
dd	Z  ZS )FunnelForQuestionAnsweringNrf   c                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r|   )
rj   rk   r^  rK  r  r   r   rn   
qa_outputsr/  rv   rx   r3   r6   rk     s
    
z#FunnelForQuestionAnswering.__init__r8  r9  )
rz   r   r   r{   start_positionsend_positionsr   r   r  rg   c
              	   C   sL  |	dk	r|	n| j j}	| j|||||||	d}
|
d }| |}|jddd\}}|d }|d }d}|dk	r|dk	rt| dkr|	d}t| dkr|d}|d}|
d|}|
d|}t|d}|||}|||}|| d }|	s6||f|
dd  }|dk	r2|f| S |S t||||
j|
jd	S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        NrT  r   r   r   r   )Zignore_indexr   )r,  start_logits
end_logitsr  r  )r]   rB  r  re  rF   r  
contiguousrX   r   Zsquezeclampr	   r   r  r  )rw   rz   r   r   r{   rf  rg  r   r   r  rN  r  r  rh  ri  Z
total_lossZignored_indexrV  Z
start_lossZend_lossr   r3   r3   r6   r}     sL    







z"FunnelForQuestionAnswering.forward)	NNNNNNNNN)r~   r   r   r   rk   r   rH  rI  r   rO  r   rJ  r   rZ   r   r   r   r   r}   r   r3   r3   rx   r6   rd    s8   
         
rd  )TF)Hr   r?   dataclassesr   typingr   r   r   r   r;   r^   rZ   r   Ztorch.nnr   r	   r
   Zactivationsr   Zmodeling_outputsr   r   r   r   r   r   Zmodeling_utilsr   utilsr   r   r   r   r   r   Zconfiguration_funnelr   Z
get_loggerr~   r=   rJ  rO  Z$FUNNEL_PRETRAINED_MODEL_ARCHIVE_LISTr   rd   Modulere   r   r   rL   r   rR   rJ   r   r   r   r  r  r  r  r&  r+  ZFUNNEL_START_DOCSTRINGrH  r-  rK  rP  rW  r]  ra  rc  rd  r3   r3   r3   r6   <module>   s     
Z   @       1#'Ic	RJULB