# coding=utf-8
# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch GPT-J model."""

import warnings
from typing import Optional, Tuple, Union

import torch
import torch.fx
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_torch_fx_proxy,
    logging,
)
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_gptj import GPTJConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "hf-internal-testing/tiny-random-gptj"
_REAL_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-j-6B"
_CONFIG_FOR_DOC = "GPTJConfig"

GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "EleutherAI/gpt-j-6B",
    # See all GPT-J models at https://huggingface.co/models?filter=gptj
]


def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.float), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


@torch.fx.wrap
def get_embed_positions(embed_positions, position_ids):
    return embed_positions.to(position_ids.device).repeat(position_ids.shape[0], 1, 1)


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')


def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)


class GPTJAttention(nn.Module):
    def __init__(self, config):
        super().__init__()

        max_positions = config.max_position_embeddings
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )
        self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)

    def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary):
        """
        Splits hidden dim into attn_head_size and num_attention_heads
        """
        new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        if rotary:
            return tensor
        if len(tensor.shape) == 5:
            return tensor.permute(0, 1, 3, 2, 4)  # (batch, blocks, head, block_length, head_features)
        elif len(tensor.shape) == 4:
            return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden dim
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(
        self,
        query,
        key,
        value,
        attention_mask=None,
        head_mask=None,
    ):
        # compute causal mask from causal mask buffer
        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]

        # Keep the attention weights computation in fp32 to avoid overflow issues
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        mask_value = torch.finfo(attn_weights.dtype).min
        # Need to be a tensor, otherwise we get error:
        # `RuntimeError: expected scalar type float but found double`.
        # It also needs to be on the same device as attn_weights.
        mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
        attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        attn_weights = attn_weights / self.scale_attn

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _get_embed_positions(self, position_ids):
        embed_positions = self.embed_positions
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions
        return embed_positions.repeat(position_ids.shape[0], 1, 1)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query, self.num_attention_heads, self.head_dim, True)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, True)
        value = self._split_heads(value, self.num_attention_heads, self.head_dim, False)

        if is_torch_fx_proxy(position_ids) or torch.jit.is_tracing():
            # The logic to conditionally copy to GPU cannot be traced, so we do this
            # every time in the torch.fx / jit.trace case
            embed_positions = get_embed_positions(self.embed_positions, position_ids)
        else:
            embed_positions = self._get_embed_positions(position_ids)

        repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1])
        sincos = torch.gather(embed_positions, 1, repeated_position_ids)
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            # Only the first `rotary_dim` features of each head get the rotary embedding;
            # the remainder passes through unchanged.
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        if layer_past is not None:
            past_key = layer_past[0]
            past_value = layer_past[1]
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        # compute self-attention: V x Softmax(QK^T)
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)


class GPTJMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class GPTJBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = GPTJAttention(config)
        self.mlp = GPTJMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        # Parallel residual: attention and MLP both read the same ln_1 output.
        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)


class GPTJPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = GPTJConfig
    base_model_prefix = "transformer"
    is_parallelizable = True
    supports_gradient_checkpointing = True
    _no_split_modules = ["GPTJBlock"]
    _skip_keys_device_placement = "past_key_values"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            # Slightly different from Mesh Transformer JAX, which uses truncated_normal for initialization
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, GPTJModel):
            module.gradient_checkpointing = value


GPTJ_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`GPTJConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

GPTJ_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
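
    Example (an illustrative sketch; it assumes the tiny doc-test checkpoint `hf-internal-testing/tiny-random-gptj`
    rather than the full GPT-J 6B model):

    ```python
    >>> from transformers import AutoTokenizer, GPTJModel

    >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gptj")
    >>> model = GPTJModel.from_pretrained("hf-internal-testing/tiny-random-gptj")

    >>> inputs = tokenizer("Hello world", return_tensors="pt")  # supplies input_ids and attention_mask
    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state  # shape (batch_size, sequence_length, n_embd)
    ```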
"""

PARALLELIZE_DOCSTRING = r"""
    This is an experimental feature and is subject to change at a moment's notice. Uses a device map to distribute
    attention modules of the model across several devices. If no device map is given, it will evenly distribute blocks
    across all devices.

    Args:
        device_map (`Dict[int, list]`, optional, defaults to None):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the
            following number of attention modules:

                - gpt-j-6B: 28

    Example:

    ```python
    # Here is an example of a device map on a machine with 4 GPUs using gpt-j-6B, which has a total of 28 attention modules:
    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6],
        1: [7, 8, 9, 10, 11, 12, 13],
        2: [14, 15, 16, 17, 18, 19, 20],
        3: [21, 22, 23, 24, 25, 26, 27],
    }
    model.parallelize(device_map)
    ```
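
    Any partition of the attention modules across the available GPUs works the same way; the 4-GPU map above is only
    one possible layout. [`deparallelize`] (documented next) reverses the split and moves the model back to CPU.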
"""

DEPARALLELIZE_DOCSTRING = r"""
    Moves the model to CPU from a model parallel state.

    Example:

    ```python
    # On a 4 GPU machine with gpt-j-6B:
    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
    device_map = {
        0: [0, 1, 2, 3, 4, 5, 6],
        1: [7, 8, 9, 10, 11, 12, 13],
        2: [14, 15, 16, 17, 18, 19, 20],
        3: [21, 22, 23, 24, 25, 26, 27],
    }
    model.parallelize(device_map)  # Splits the model across several devices
    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
    ```
"""


@add_start_docstrings(
    "The bare GPT-J Model transformer outputting raw hidden-states without any specific head on top.",
    GPTJ_START_DOCSTRING,
)
class GPTJModel(GPTJPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`GPTJModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1':"
            " 1, ...}",
            FutureWarning,
        )
        # Check validity of device_map
        self.device_map = (
            get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
        )
        assert_device_map(self.device_map, len(self.h))
        self.model_parallel = True
        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
        self.last_device = "cuda:" + str(max(self.device_map.keys()))
        self.wte = self.wte.to(self.first_device)
        # Load blocks onto their devices
        for k, v in self.device_map.items():
            for block in v:
                cuda_device = "cuda:" + str(k)
                self.h[block] = self.h[block].to(cuda_device)
        # ln_f to the last device
        self.ln_f = self.ln_f.to(self.last_device)

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.model_parallel = False
        self.device_map = None
        self.first_device = "cpu"
        self.last_device = "cpu"
        self.wte = self.wte.to("cpu")
        for index in range(len(self.h)):
            self.h[index] = self.h[index].to("cpu")
        self.ln_f = self.ln_f.to("cpu")
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
        real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1]).long()

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 4D attention mask from a 2D tensor mask with sizes
            # [batch_size, 1, 1, to_seq_length] so it can be broadcast to
            # [batch_size, num_heads, from_seq_length, to_seq_length].
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this creates a tensor which is 0.0 for positions we
            # want to attend and the dtype's smallest value for masked positions.
            # Added to the raw scores before softmax, this is effectively the same as
            # removing the masked positions entirely.
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        # head_mask has shape n_layer x batch x num_attention_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                # Ensure layer_past is on the same device as hidden_states
                if layer_past is not None:
                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
                # Ensure that attention_mask is always on the same device as hidden_states
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if isinstance(head_mask, torch.Tensor):
                    head_mask = head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, use_cache, output_attentions)

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    None,
                    attention_mask,
                    position_ids,
                    head_mask[i],
                )
            else:
                outputs = block(
                    hidden_states=hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

            # Model Parallel: if it's the last layer for that device, move things to the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@add_start_docstrings(
    """
    The GPT-J Model transformer with a language modeling head on top.
    """,
    GPTJ_START_DOCSTRING,
)
class GPTJForCausalLM(GPTJPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPTJModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`GPTJForCausalLM.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'transformer.h.0': 0, 'transformer.h.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.transformer.h))
        self.transformer.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.transformer.first_device)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.transformer.deparallelize()
        self.transformer = self.transformer.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        torch.cuda.empty_cache()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only use the last token of input_ids if a past is defined
        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
            }
        )

        return model_inputs

    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
        real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to
            `-100` are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.transformer.first_device)
            hidden_states = hidden_states.to(self.lm_head.weight.device)

        # make sure sampling in fp16 works correctly and
        # compute loss in fp32 to match with mesh-tf version
        # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
        lm_logits = self.lm_head(hidden_states).to(torch.float32)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )


@add_start_docstrings(
    """
    The GPT-J Model transformer with a sequence classification head on top (linear layer).

    [`GPTJForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT, GPT-2, GPT-Neo) do.

    Since it does classification on the last token, it needs to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
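
    Example (an illustrative sketch; it assumes the tiny classification checkpoint
    `ydshieh/tiny-random-gptj-for-sequence-classification` that this class's doc tests already rely on):

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, GPTJForSequenceClassification

    >>> tokenizer = AutoTokenizer.from_pretrained("ydshieh/tiny-random-gptj-for-sequence-classification")
    >>> model = GPTJForSequenceClassification.from_pretrained("ydshieh/tiny-random-gptj-for-sequence-classification")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    >>> with torch.no_grad():
    ...     logits = model(**inputs).logits  # (batch_size, num_labels), pooled at the last non-padding token
    ```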
    """,
    GPTJ_START_DOCSTRING,
)
class GPTJForSequenceClassification(GPTJPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPTJModel(config)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="ydshieh/tiny-random-gptj-for-sequence-classification",
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # index of the last non-padding token in each row
                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
                    logits.device
                )
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(pooled_logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The GPT-J Model transformer with a span classification head on top for extractive question-answering tasks like
    SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
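
    Example (an illustrative sketch; the checkpoint and the question/context strings are stand-ins, and the tiny
    doc-test model below has a randomly initialized span head, so the predicted span is meaningless):

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, GPTJForQuestionAnswering

    >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gptj")
    >>> model = GPTJForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-gptj")

    >>> question, context = "Who released GPT-J?", "GPT-J was released by EleutherAI."
    >>> inputs = tokenizer(question, context, return_tensors="pt")
    >>> with torch.no_grad():
    ...     outputs = model(**inputs)
    >>> start = int(outputs.start_logits.argmax())  # most likely start index of the answer span
    >>> end = int(outputs.end_logits.argmax())  # most likely end index of the answer span
    ```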
    """,
    GPTJ_START_DOCSTRING,
)
class GPTJForQuestionAnswering(GPTJPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPTJModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )