U
    ,-eE                 	   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZmZ d dlm Z m!Z! d	d
l"m#Z# d	dl$m%Z% d	dl&m'Z' d	dl(m)Z)m*Z* d	dl+m,Z,m-Z-m.Z. d	dl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 d	dl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZW d	dlXmYZYmZZZ d	dl[m\Z\m]Z]m^Z^m_Z_ d	dl`maZambZbmcZc d	dldmeZe ejfgddh Ziejfgddh ZjeK rd dlkmlZlmmZmmnZn d dlompZp d dlqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZy eS rd dlzm{Z{ d dl|m}Z~ d dl|mZ eUeZdadd  Zd!d" Ze] rd dlm  mZ d d#lmZ eeed$kZnd%ZeQ rd	d&l7mZ edgd'd(Zeeje*d)f d*d+d,Zeeje*d)f d*d-d.Zeeje*d)f d*d/d0Zd1d2 Zd3d4 Zd5d6 Zd7eBfeeejf eeef ed8d9d:Zdhd;d<Zeeejf d=d>d?Zd@dA ZdBdC ZdDdE ZdFdG ZdidHdIZdjeee edJdKdLZG dMd) d)ZG dNdO dOejee*eEe,ZeGeje_ejjdk	rzejjjdPdQdRdSej_G dTdU dUejZG dVdW dWejZG dXdY dYejZeG dZd[ d[eDZG d\d] d]ejZG d^d_ d_ejZejejd`dadbZdcdd Zdedf ZdS )k    N)contextmanager)	dataclass)partialwraps)AnyCallableDictListOptionalTupleUnion)version)Tensornn)CrossEntropyLossIdentity   )get_activation)PretrainedConfig)custom_object_save)GenerationConfigGenerationMixin)PeftAdapterMixindeepspeed_configis_deepspeed_zero3_enabled)Conv1Dapply_chunking_to_forward find_pruneable_heads_and_indicesid_tensor_storageprune_conv1d_layerprune_layerprune_linear_layer) ADAPTER_SAFE_WEIGHTS_NAMEADAPTER_WEIGHTS_NAMECONFIG_NAMEDUMMY_INPUTSFLAX_WEIGHTS_NAMESAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMETF2_WEIGHTS_NAMETF_WEIGHTS_NAMEWEIGHTS_INDEX_NAMEWEIGHTS_NAMEContextManagersModelOutputPushToHubMixincached_file	copy_funcdownload_urlextract_commit_hashhas_fileis_accelerate_availableis_auto_gptq_availableis_bitsandbytes_availableis_flash_attn_availableis_offline_modeis_optimum_availableis_peft_availableis_remote_urlis_safetensors_availableis_torch_tpu_availableloggingreplace_return_docstrings	strtobool)convert_file_size_to_intget_checkpoint_shard_files)ENV_VARS_TRUE_VALUESis_sagemaker_mp_enabledis_torch_fx_proxyis_torchdynamo_compiling)BitsAndBytesConfig
GPTQConfigQuantizationMethod)require_version_coreXLA_USE_BF160XLA_DOWNCAST_BF16)dispatch_modelinfer_auto_device_mapinit_empty_weights)add_hook_to_module)$check_tied_parameters_on_same_devicefind_tied_parametersget_balanced_memoryget_max_memoryload_offloaded_weightsoffload_weightsave_offload_indexset_module_tensor_to_device)	safe_open)	load_file)	save_fileTc                   C   s*   t j o(t j o(ttjdddkS )NZACCELERATE_USE_FSDPFalser   )torchdistributedis_availableis_initializedrA   osenvironget rf   rf   \/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/modeling_utils.pyis_fsdp_enabled{   s
    
rh   c                   C   s   t  ottjdddkS )NZ
LOCAL_RANKr   )rh   intrc   rd   re   rf   rf   rf   rg   is_fsdp_enabled_and_dist_rank_0   s    rk   __version__z1.10F)find_adapter_config_filec                 c   s"   t }| rda z
dV  W 5 |a X dS )z
    Context manager to globally disable weight initialization to speed up loading large models.

    TODO(Patrick): Delete safety argument `_enable=True` at next major version. .
    FN)_init_weights)_enableZold_init_weightsrf   rf   rg   no_init_weights   s    
rq   ModuleUtilsMixin)	parameterc                 C   sf   zt |  jW S  tk
r`   tjtttt	f  ddd}| j
|d}t |}|d j Y S X d S )Nmodulereturnc                 S   s   dd | j  D }|S )Nc                 S   s"   g | ]\}}t |r||fqS rf   r_   Z	is_tensor.0kvrf   rf   rg   
<listcomp>   s     
 zHget_parameter_device.<locals>.find_tensor_attributes.<locals>.<listcomp>__dict__itemsru   Ztuplesrf   rf   rg   find_tensor_attributes   s    z4get_parameter_device.<locals>.find_tensor_attributesZget_members_fnr   )next
parametersdeviceStopIterationr   Moduler	   r   strr   _named_membersrs   r   genZfirst_tuplerf   rf   rg   get_parameter_device   s    r   c                 C   sf   zt |  jW S  tk
r`   tjtttt	f  ddd}| j
|d}t |}|d j Y S X dS )z`
    Returns the first parameter dtype (can be non-floating) or asserts if none were found.
    rt   c                 S   s   dd | j  D }|S )Nc                 S   s"   g | ]\}}t |r||fqS rf   rw   rx   rf   rf   rg   r|      s     
 zMget_first_parameter_dtype.<locals>.find_tensor_attributes.<locals>.<listcomp>r}   r   rf   rf   rg   r      s    z9get_first_parameter_dtype.<locals>.find_tensor_attributesr   r   N)r   r   dtyper   r   r   r	   r   r   r   r   r   rf   rf   rg   get_first_parameter_dtype   s    r   c                 C   s  d}|   D ]n}|j}| rttkr6t r6tj  S ttkrpt rp|jtj	krZtj  S |jtj
krptj  S |j  S q|dk	r|S tjttttf  ddd}| j|d}d}|D ]"}|}|d  r|d j  S q|dk	r|d jS |  D ]}|j}| r|j  S q|S )zz
    Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found.
    Nrt   c                 S   s   dd | j  D }|S )Nc                 S   s"   g | ]\}}t |r||fqS rf   rw   rx   rf   rf   rg   r|      s     
 zGget_parameter_dtype.<locals>.find_tensor_attributes.<locals>.<listcomp>r}   r   rf   rf   rg   r      s    z3get_parameter_dtype.<locals>.find_tensor_attributesr   r   )r   r   is_floating_pointrL   rD   r>   r_   bfloat16rN   floatdoublefloat32r   r   r	   r   r   r   r   buffers)rs   Z
last_dtypetr   r   Z
last_tupletuplerf   rf   rg   get_parameter_dtype   s8    



r   c                 C   s,   |   D ]}| r|j  S qtddS )z_
    Returns the first found floating dtype in `state_dict` or asserts if none were found.
    z5couldn't find any floating point dtypes in state_dictN)valuesr   r   
ValueError
state_dictr   rf   rf   rg   get_state_dict_float_dtype   s    r   c                 C   s.   |   D ]}| r|j  S qt|   jS )zt
    Returns the first found floating dtype in `state_dict` if there is one, otherwise returns the first dtype.
    N)r   r   r   r   r   rf   rf   rg   get_state_dict_dtype  s    r   c                 C   sN   | t jkrdS tdt| }|dkr6td|  dt| d }|d S )z
    Returns the size (in bytes) occupied by one parameter of type `dtype`.

    Example:

    ```py
    >>> dtype_byte_size(torch.float32)
    4
    ```
    g      ?z[^\d](\d+)$Nz`dtype` is not a valid dtype: .r      )r_   boolresearchr   r   rj   groups)r   Z
bit_searchZbit_sizerf   rf   rg   dtype_byte_size  s    
r   10GB)r   max_shard_sizeweights_namec              	   C   s  t |}i g}d}d}i }|  D ]\}}t|tr8q"nt|}	|	|kr^||	 }
|||
 |< q"| t|j }|| |krt|d dkr|	i  d}||d |< ||7 }||7 }t|d ||	< q"t|dkr||d idfS i }i }t
|D ]t\}}|dd|d ddt|dd}|d	d|d ddt|dd	}|||< | D ]}|||< qXqd
|i}||d}||fS )a  
    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
    given size.

    The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
    optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
    limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
    [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].

    <Tip warning={true}>

    If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
    have a size greater than `max_shard_size`.

    </Tip>

    Args:
        state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
            The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
            (like `"5MB"`).
        weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
            The name of the model save file.
    r   ri   r   N.bin-Z05dz-of-.safetensors
total_size)metadata
weight_map)rB   r   
isinstancer   r   numelr   r   lenappend	enumeratereplacekeys)r   r   r   Zsharded_state_dictsZlast_block_sizer   Zstorage_id_to_blockkeyweightZ
storage_idZblock_idZweight_sizer   shardsidxshard
shard_filer   indexrf   rf   rg   shard_checkpoint"  sJ    

& 
r   c              	      s"  t j|t}t j|t}t j|}t j|}|sr|rBt srt rPttfntf}tdd| d| dd}	|r|rt rd}	qt	d| d n|sd}	|	r|n|}
t
|
d	d
d}t|}W 5 Q R X tt|d  }|d   |    fddD }fdd D }|rt|dksHt|dkrd| jj }t|dkrddd |D }|d| d7 }t|dkrddd |D }|d| d7 }t||	rtnttjdd}|D ]0}|t j||}| j|dd ~t  qtjjj||S )a  
    This is the same as
    [`torch.nn.Module.load_state_dict`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict)
    but for a sharded checkpoint.

    This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
    loaded in the model.

    Args:
        model (`torch.nn.Module`): The model in which to load the checkpoint.
        folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint.
        strict (`bool`, *optional`, defaults to `True`):
            Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.
        prefer_safe (`bool`, *optional*, defaults to `False`)
            If both safetensors and PyTorch save files are present in checkpoint and `prefer_safe` is True, the
            safetensors files will be loaded. Otherwise, PyTorch files are always loaded when possible.

    Returns:
        `NamedTuple`: A named tuple with `missing_keys` and `unexpected_keys` fields
            - `missing_keys` is a list of str containing the missing keys
            - `unexpected_keys` is a list of str containing the unexpected keys
    zCan't find a checkpoint index ( or z) in r   FTz"Cannot load sharded checkpoint at z+ safely since safetensors is not installed!rutf-8encodingr   c                    s   g | ]}| kr|qS rf   rf   ry   r   )loaded_keysrf   rg   r|     s      z+load_sharded_checkpoint.<locals>.<listcomp>c                    s   g | ]}| kr|qS rf   rf   r   )
model_keysrf   rg   r|     s      r   #Error(s) in loading state_dict for ,c                 S   s   g | ]}d | d qS "rf   ry   rz   rf   rf   rg   r|     s     z
Missing key(s): c                 S   s   g | ]}d | d qS r   rf   r   rf   rf   rg   r|     s     cpumap_location)strict) rc   pathjoinr+   r'   isfiler=   r   loggerwarningopenjsonloadlistsetr   r   r   r   	__class____name__RuntimeErrorsafe_load_filer   r_   load_state_dictgccollectr   modulesru   Z_IncompatibleKeys)modelfolderr   Zprefer_safeZ
index_fileZsafe_index_fileZindex_presentZsafe_index_present	filenamesZ	load_safeZ
load_indexfr   Zshard_filesmissing_keysunexpected_keyserror_messageZstr_missing_keysZstr_unexpected_keysloaderr   r   rf   )r   r   rg   load_sharded_checkpointu  sT    
"r   )checkpoint_filec                 C   sT  |  drxt rxt| dd}| }W 5 Q R X |ddkrPtd|  dn |d dkrptd|d  d	t| S z>t st	 rt
j rt
j d
krd}nd}t
j| |dW S  tk
rN } zxzBt| 0}|ddkrtdntd|  d|W 5 Q R X W n0 ttfk
r<   td|  d|  dY nX W 5 d}~X Y nX dS )z]
    Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
    r   pt)	frameworkformat)r   tfZflaxz"The safetensors archive passed at zf does not contain the valid metadata. Make sure you save your model with the `save_pretrained` method.zConversion from a z7 safetensors archive to PyTorch is not implemented yet.r   metar   r      r   zYou seem to have cloned a repository without having git-lfs installed. Please install git-lfs and run `git lfs install` followed by `git lfs pull` in the folder you cloned.zUnable to locate the file z_ which is necessary to load this pretrained model. Make sure you have saved the model properly.z9Unable to load weights from pytorch checkpoint file for 'z' at 'zZ'. If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.N)endswithr=   r[   r   re   OSErrorNotImplementedErrorr   r   rh   r_   r`   rb   get_rankr   	Exceptionr   readr   UnicodeDecodeError)r   r   r   r   erf   rf   rg   r     sP    


r   c                    sN   |   D ]@\ } fdd|D }tt|  t| dkrd|_qdS )z
    Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
    dict.
    c                    s.   g | ]&}|   d r|  d dqS r    )
startswithr   r   module_namerf   rg   r|     s      z.set_initialized_submodules.<locals>.<listcomp>r   TN)named_modulesr   r   r   r   _is_hf_initialized)r   Zstate_dict_keysru   r   rf   r   rg   set_initialized_submodules  s     r   c                    s   g }g }|  D ]H}d }d|kr,|dd}d|kr@|dd}|r|| || qt||D ]\}}||||< qdt|dd | }d k	r|_g  dtj	d fdd	| ||d
 ~ S )Ngammar   betabias	_metadatar   )ru   c           	   	      s
  d kri n d d i }||dg g f}tfdd|D dkrt rdd l}t| jd d dd  fdd| D }t|dkr|jj|dd	 t	j
 dkr| j|  W 5 Q R X n
| j|  | j D ]$\}}|d k	r||| d
  qd S )Nri   Tc                    s   g | ]}|  r|qS rf   r   r   prefixrf   rg   r|   %  s     
 z=_load_state_dict_into_model.<locals>.load.<locals>.<listcomp>r   F)r  recursec                    s   g | ]}| kr | qS rf   rf   r   )named_parametersrf   rg   r|   ,  s      Zmodifier_rankr   )re   r   r   	deepspeeddictr  r   zeroGatheredParametersr_   r`   r   Z_load_from_state_dict_modulesr   )	ru   r   r  Zlocal_metadataargsr
  Zparams_to_gathernamechild
error_msgsr   r   )r  r  rg   r      s     
z)_load_state_dict_into_model.<locals>.loadr  )r   )
r   r   r   zippopgetattrcopyr  r   r   )model_to_loadr   start_prefixold_keysnew_keysr   new_keyold_keyrf   r  rg   _load_state_dict_into_model  s,    
r  c                 C   s   t |dkr.||r.d|ddd }|d}| }t |dkrtt||d rlt||d }|d= q<d}qtq<|| krd}||d fS )z
    A helper util to find the last sub-module and the param/buffer name. If `start_prefix` is supplied it'll be removed
    from the start of the key
    r   r   r   N)r   r   r   splithasattrr  )r   Zlong_keyr  Z	split_key	submodulerf   rf   rg   find_submodule_and_param_nameC  s    
r"  c                 C   sh   |D ]^}t | ||\}}|dk	rt||}t|tjjrLtj|d}n
|d}t||| qdS )z
    Moves `loaded_state_dict_keys` in model to meta device which frees up the memory taken by those params.

    `start_prefix` is used for models which insert their name into model keys, e.g. `bert` in
    `bert.pooler.dense.weight`

    Nr   )r"  r  r   r_   r   	Parametertosetattr)r   loaded_state_dict_keysr  rz   r!  
param_namenew_valrf   rf   rg   _move_model_to_metaZ  s    

r)  c                    s  |rddl m} g }g }g }| D ]H}d}d|kr@|dd}d|krT|dd}|r$|| || q$t||D ]\}}||||< qx| D ]2\ } |ks |krq |rΈ t	|d   }i }|
dk	rRt
|rR|dk	rHt fdd	|D rH|
t
jkrH|t
j}d
tttjkrRt
j|d
< n
||
}|
dkr| } d}|D ] }t||}|dkrn qqn|dk	r||j}||d< |dkrd}n^t	|dkr||krd|ddd }q|dkrd|krt  d|| }|dkr<|st| ||}q|dkr`|	dk	r`t| ||	}	q|sxt|  |f| q|jt
jkr dd| kr| dd }nd}d kr||  |||d q|||	fS )a  
    This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its
    params on a `meta` device. It replaces the model params with the data from the `state_dict`, while moving the
    params back to the normal device, but only for `loaded_state_dict_keys`.

    `start_prefix` is used for models which insert their name into model keys, e.g. `bert` in
    `bert.pooler.dense.weight`

    r   %set_module_quantized_tensor_to_deviceNr   r   r  r  c                 3   s   | ]}| kV  qd S Nrf   ry   Zmodule_to_keep_in_fp32r'  rf   rg   	<genexpr>  s     z3_load_state_dict_into_meta_model.<locals>.<genexpr>r   r   valuer   r   ri   r   z doesn't have any device set.diskSCB)r0  fp16_statistics)integrationsr+  r   r   r   r  r  r   r   r   r_   r   anyfloat16r$  r   r   inspect	signaturerZ   r   r  r  r   r   r   rX   int8)r   r   r&  r  expected_keys
device_mapoffload_folderoffload_indexstate_dict_folderstate_dict_indexr   is_quantizedis_safetensorskeep_in_fp32_modulesr+  r  r  r  r   r  r  paramr   Zset_module_kwargs	old_paramsplitsr  Zparam_devicer3  rf   r.  rg    _load_state_dict_into_meta_modelt  s    !










$    rF  )r   variantrv   c                 C   s>   |d k	r:|  d}|d d |g |dd   }d|} | S )Nr   ri   )r  r   )r   rG  rE  rf   rf   rg   _add_variant  s
    

rH  c                   @   s  e Zd ZdZedd Zedd Zdd Zdd	 Ze	e
jd
ddZe	e
jd
ddZeedddZed(ddZd)eee e
je
jedddZd*ee eeedddZdd Zd+eeeddd Zeeee
jef f ed!d"d#Zd,eeee
jef f eed%d&d'ZdS )-rr   zH
    A few utilities for `torch.nn.Modules`, to be used as a mixin.
    c                 O   sL   zdd l }W n tk
r(   tdY nX |t }| }|j| _d S )Nr   FYou need to install psutil (pip install psutil) to use memory tracing.)psutilImportErrorProcessrc   getpidmemory_inforssmem_rss_pre_forward)ru   r  kwargsrJ  processmemrf   rf   rg   _hook_rss_memory_pre_forward  s    z-ModuleUtilsMixin._hook_rss_memory_pre_forwardc                 O   sr   zdd l }W n tk
r(   tdY nX |t }| }|j| _| j| j }|t	| drf| j
nd | _
d S )Nr   rI  mem_rss_diff)rJ  rK  rL  rc   rM  rN  rO  mem_rss_post_forwardrP  r   rU  )ru   r  rQ  rJ  rR  rS  rU  rf   rf   rg   _hook_rss_memory_post_forward  s    z.ModuleUtilsMixin._hook_rss_memory_post_forwardc                 C   s2   |   D ]}|| j || j q|   dS )a%  
        Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.

        Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero
        with `model.reset_memory_hooks_state()`.
        N)r   Zregister_forward_pre_hookrT  register_forward_hookrW  reset_memory_hooks_stateselfru   rf   rf   rg   add_memory_hooks  s    z!ModuleUtilsMixin.add_memory_hooksc                 C   s$   |   D ]}d|_d|_d|_qdS )z
        Reset the `mem_rss_diff` attribute of each module (see [`~modeling_utils.ModuleUtilsMixin.add_memory_hooks`]).
        r   N)r   rU  rV  rP  rZ  rf   rf   rg   rY  (  s    z)ModuleUtilsMixin.reset_memory_hooks_staterv   c                 C   s   t | S )z
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        )r   r[  rf   rf   rg   r   1  s    zModuleUtilsMixin.devicec                 C   s   t | S )zw
        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
        )r   r^  rf   rf   rg   r   9  s    zModuleUtilsMixin.dtype)encoder_attention_maskrv   c                 C   st   |  dkr(|dddddddf }|  dkrL|ddddddf }|j| jd}d| t| jj }|S )z
        Invert an attention mask (e.g., switches 0. and 1.).

        Args:
            encoder_attention_mask (`torch.Tensor`): An attention mask.

        Returns:
            `torch.Tensor`: The inverted attention mask.
           N   r         ?)dimr$  r   r_   finfomin)r[  r_  Zencoder_extended_attention_maskrf   rf   rg   invert_attention_mask@  s    
z&ModuleUtilsMixin.invert_attention_maskNc           	      C   s   |d k	rt dt n|j}| \}}tj||d}|d d d d f ||d|d d d d f k}||j}|j	d |j	d k r|j	d |j	d  }tj
tj|||f||jd|gdd}|d d d d d d d f |d d d d d d f  }|S )NNThe `device` argument is deprecated and will be removed in v5 of Transformers.)r   r   r   r   ri   )Zaxis)warningswarnFutureWarningr   r_   Zarangerepeatr$  r   shapecatZones)	input_shapeattention_maskr   Z
batch_sizeZ
seq_lengthZseq_idsZcausal_maskZprefix_seq_lenextended_attention_maskrf   rf   rg   *create_extended_attention_mask_for_decoderX  s(     .4z;ModuleUtilsMixin.create_extended_attention_mask_for_decoder)rq  rp  r   r   rv   c                 C   s   |dkr| j }| dkr"| jjs6|dk	r6tdt | dkr`|dddddddf }nV| dkr| jjrt|||}q|ddddddf }nt	d| d|j
 d|j|d}d	| t|j }|S )
a  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nra  rh  r`  z!Wrong shape for input_ids (shape z) or attention_mask (shape )rb  rc  )r   rd  configZ
is_decoderrj  rk  rl  rr   rs  r   rn  r$  r_   re  rf  )r[  rq  rp  r   r   rr  rf   rf   rg   get_extended_attention_maskt  s0       	z,ModuleUtilsMixin.get_extended_attention_maskF)	head_masknum_hidden_layersis_attention_chunkedrv   c                 C   s6   |dk	r(|  ||}|dkr2|d}n
dg| }|S )a  
        Prepare the head mask if needed.

        Args:
            head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
                The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
            num_hidden_layers (`int`):
                The number of hidden layers in the model.
            is_attention_chunked (`bool`, *optional*, defaults to `False`):
                Whether or not the attentions scores are computed by chunks or not.

        Returns:
            `torch.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with
            `[None]` for each layer.
        NTri   )_convert_head_mask_to_5d	unsqueeze)r[  rw  rx  ry  rf   rf   rg   get_head_mask  s    
zModuleUtilsMixin.get_head_maskc                 C   s   |  dkr<|dddd}||dddd}n"|  dkr^|ddd}|  dks|td|   |j| jd}|S )zD-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]r   r   ri   ra     zhead_mask.dim != 5, instead rb  )rd  r{  expandAssertionErrorr$  r   )r[  rw  rx  rf   rf   rg   rz    s    z)ModuleUtilsMixin._convert_head_mask_to_5d)only_trainableexclude_embeddingsrv   c                    s   |r.dd |   D   fdd|  D }nt|  }g }t| dd}|rft r^ddl}ntd|D ]B}|jsx|sj|rt	||j
jr|| d	  qj||  qjt|S )
a  
        Get number of (optionally, trainable or non-embeddings) parameters in the module.

        Args:
            only_trainable (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of trainable parameters

            exclude_embeddings (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of non-embeddings parameters

        Returns:
            `int`: The number of parameters.
        c                 S   s&   g | ]\}}t |tjr| d qS )z.weight)r   r   	Embedding)ry   r  module_typerf   rf   rg   r|     s     z3ModuleUtilsMixin.num_parameters.<locals>.<listcomp>c                    s   g | ]\}}| kr|qS rf   rf   )ry   r  rs   Zembedding_param_namesrf   rg   r|     s     is_loaded_in_4bitFr   Nzbitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong make sure to install bitsandbytes with `pip install bitsandbytes`.ra  )r   r  r   r   r  r7   bitsandbytesr   Zrequires_gradr   r   Z
Params4bitr   r   sum)r[  r  r  Ztotal_parametersZtotal_numelr  ZbnbrC  rf   r  rg   num_parameters  s,    


zModuleUtilsMixin.num_parameters)
input_dictrv   c                 C   sJ   t | dsi | _| j|kr(|| j  S d| jkrFtd d| jd< dS )z
        Helper function to estimate the total number of tokens from the model inputs.

        Args:
            inputs (`dict`): The model inputs.

        Returns:
            `int`: The total number of tokens.
        warnings_issuedestimate_tokenszdCould not estimate the number of tokens of the input, floating-point operations will not be computedTr   )r   r  main_input_namer   r   r   )r[  r  rf   rf   rg   r    s    




z ModuleUtilsMixin.estimate_tokensT)r  r  rv   c                 C   s   d|  | | j|d S )a  
        Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
        batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
        tokens (valid if `12 * d_model << sequence_length`) as laid out in [this
        paper](https://arxiv.org/pdf/2001.08361.pdf) section 2.1. Should be overridden for transformers with parameter
        re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.

        Args:
            batch_size (`int`):
                The batch size for the forward pass.

            sequence_length (`int`):
                The number of tokens in each line of the batch.

            exclude_embeddings (`bool`, *optional*, defaults to `True`):
                Whether or not to count embedding and softmax operations.

        Returns:
            `int`: The number of floating-point operations.
           )r  )r  r  )r[  r  r  rf   rf   rg   floating_point_ops  s    z#ModuleUtilsMixin.floating_point_ops)N)NN)F)FF)T)r   
__module____qualname____doc__staticmethodrT  rW  r\  rY  propertyr_   r   r   r   rg  rs  r   rj   r   rv  r
   r   r|  rz  r  r   r   r   r   r  r  rf   rf   rf   rg   rr     sN   

	      5   /"  c                       s  e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZdZdZdZdZdZeeeejf dddZeedd	d
Zed fddZdd Zdd Zedd ZeejejdddZ ee!j"dddZ#ee$dddZ%edte&ej e&e'eeee(f f  edddZ)dd Z*d d! Z+e!j"dd"d#Z,e!j"d$d%d&Z-e!j"dd'd(Z.d)d* Z/d+d, Z0d-d. Z1e2e!j"e!j"ed/d0d1Z3d2d3 Z4due&e( e&e( e!j5d4d5d6Z6dvd7d8Z7dwe!j5e&e( e&e( e!j5d9d:d;Z8dxe!j9e&e( e&e$ e!j9d<d=d>Z:d?d@ Z;e(dAdBdCZ<e'e!j5e=e!j5 f ddDdEZ>dFdG Z?ee(e@e( f dHdIdJZAdKdL ZBdMdN ZCee$ddOdPZDdQdejEddRddddQf	e'eeFjGf e$e&eH eIe$e'e(ef e$e&e e&e'ee$f  e$dS
dTdUZJdydVdWZKeLej!j"jM fdXdYZMeLej!j"jN fdZd[ZN fd\d]ZO fd^d_ZPeddddddd`ddae&e'eeFjGf  e&e'eeeFjGf  e&e'eeFjGf  e$e$e$e&e'ee$f  ee$db	dcddZQedzdedfZRd{dgdhZSe2d|didjZTed}dldmZUd ddndoZVdpdq ZWdrds ZX  ZYS )~PreTrainedModela  
    Base class for all models.

    [`PreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
    downloading and saving models as well as a few methods common to all models to:

        - resize the input embeddings,
        - prune heads in the self-attention heads.

    Class attributes (overridden by derived classes):

        - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
          for this model architecture.
        - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model,
          taking as arguments:

            - **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the TensorFlow checkpoint.
            - **config** ([`PreTrainedConfig`]) -- An instance of the configuration associated to the model.
            - **path** (`str`) -- A path to the TensorFlow checkpoint.

        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
          classes of the same architecture adding modules on top of the base model.
        - **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
          models, `pixel_values` for vision models and `input_values` for speech models).
    Nr   	input_idsFr]  c                 C   s   dt tiS )z^
        `Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
        r  )r_   tensorr%   r^  rf   rf   rg   dummy_inputsc  s    zPreTrainedModel.dummy_inputsc                 C   s   dS )z@
        :str: Identifies that this is a PyTorch model.
        r   rf   r^  rf   rf   rg   r   j  s    zPreTrainedModel.frameworkru  c                    sb   t    t|ts2td| jj d| jj d|| _|j| _i | _	| 
 rXt|nd | _d S )NzParameter config in `zt(config)` should be an instance of class `PretrainedConfig`. To create a model from a pretrained model use `model = z(.from_pretrained(PRETRAINED_MODEL_NAME)`)super__init__r   r   r   r   r   ru  name_or_pathr  can_generater   Zfrom_model_configgeneration_config)r[  ru  inputsrQ  r   rf   rg   r  q  s    

zPreTrainedModel.__init__c                 C   s   |    |   dS )z
        A method executed at the end of each Transformer model initialization, to execute code that needs the model's
        modules properly initialized (such as weight initialization).
        N)init_weights._backward_compatibility_gradient_checkpointingr^  rf   rf   rg   	post_init  s    zPreTrainedModel.post_initc                 C   s,   | j r(t| jddr(|   t| jd d S )Ngradient_checkpointingF)supports_gradient_checkpointingr  ru  gradient_checkpointing_enabledelattrr^  rf   rf   rg   r    s    z>PreTrainedModel._backward_compatibility_gradient_checkpointingc              	   K   s   | dd}d}|dk	r"| |}t rdddl}td |jjt d | |f|}W 5 Q R X n| |f|}|dk	rt	
| |S )z
        All context managers that the model should be initialized under go here.

        Args:
            torch_dtype (`torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model under this dtype.
        torch_dtypeNr   @Detected DeepSpeed ZeRO-3: activating zero.init() for this modelZconfig_dict_or_path)r  _set_default_torch_dtyper   r
  r   infor  Initr   r_   set_default_dtype)clsru  rQ  r  
dtype_origr
  r   rf   rf   rg   _from_config  s    	


zPreTrainedModel._from_config)r   rv   c                 C   sN   |j std| j d| dtd| j d| d t }t| |S )a  
        Change the default dtype and return the previous one. This is needed when wanting to instantiate the model
        under specific dtype.

        Args:
            dtype (`torch.dtype`):
                a floating dtype to set to.

        Returns:
            `torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)` if it was
            modified. If it wasn't, returns `None`.

        Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
        `torch.int64` is passed. So if a non-float `dtype` is passed this functions will throw an exception.
        zCan't instantiate z model under dtype=z' since it is not a floating point dtypezInstantiating z model under default dtype r   )r   r   r   r   r  r_   Zget_default_dtyper  )r  r   r  rf   rf   rg   r    s    
z(PreTrainedModel._set_default_torch_dtypec                 C   s   t | | j| S )z@
        `torch.nn.Module`: The main body of the model.
        )r  base_model_prefixr^  rf   rf   rg   
base_model  s    zPreTrainedModel.base_modelc                 C   s$   dt | jkr dt | jkr dS dS )z
        Returns whether this model can generate sequences with `.generate()`.

        Returns:
            `bool`: Whether this model can generate sequences with `.generate()`.
        r   FT)r   Zprepare_inputs_for_generationgenerater  rf   rf   rg   r    s    
zPreTrainedModel.can_generate)r  r;  rv   c                 C   s  | j stdt stdn2ttjd}|tdk}|sPtd| t| dd}|rhtd|d	kr|t	
d
 n(|d	k	r|tjtjfkrtd| d|d	krtj rt	
d qtdn2|d	k	rt|trd| ksd| krtdd|_|S )a  
        If you don't know about Flash Attention, check out the official repository of flash attention:
        https://github.com/Dao-AILab/flash-attention

        For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a look at this
        specific section of the documentation to learn more about it:
        https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models

        The method checks if the current setup is compatible with Flash Attention as it requires the model to be in
        half precision and not ran on CPU.

        If all checks pass, the method will create an attribute in the config `_flash_attn_2_enabled` so that the model
        can initialize the correct attention module
        zThe current architecture does not support Flash Attention 2.0. Please open an issue on GitHub to request support for this architecture: https://github.com/huggingface/transformers/issues/newzFlash Attention 2.0 is not available. Please refer to the documentation of https://github.com/Dao-AILab/flash-attention for installing it.Z
flash_attnz2.0.0zxYou need flash_attn package version to be greater than 2.0. Make sure to have that version installed - detected version Zuse_bettertransformerFzFlash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing model.reverse_bettertransformer()NzwYou are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviourzVFlash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes. You passed z*, this might lead to unexpected behaviour.zYou are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.a)  You are attempting to use Flash Attention 2.0 with a model initialized on CPU and with no GPU available. This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then moving it to GPU.r   r1  zYou are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to initialise the model on a GPU by passing a device_map that contains only GPU devices as keys.T)_supports_flash_attn_2r   r8   rK  r   parse	importlibr   r  r   r   r_   r6  r   cudara   r   r  r   Z_flash_attn_2_enabled)r  ru  r  r;  Zflash_attention_versionZis_flash_greater_than_2Z_is_bettertransformerrf   rf   rg   _check_and_enable_flash_attn_2  s`    



z.PreTrainedModel._check_and_enable_flash_attn_2c                 C   s   dd }|   || _dS )z
        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
        the model weights fixed.
        c                 S   s   | d d S )NT)Zrequires_grad_)ru   inputoutputrf   rf   rg   make_inputs_require_grads2  s    zMPreTrainedModel.enable_input_require_grads.<locals>.make_inputs_require_gradsN)get_input_embeddingsrX  _require_grads_hook)r[  r  rf   rf   rg   enable_input_require_grads,  s    z*PreTrainedModel.enable_input_require_gradsc                 C   s   | j   dS )z4
        Removes the `_require_grads_hook`.
        N)r  remover^  rf   rf   rg   disable_input_require_grads7  s    z+PreTrainedModel.disable_input_require_gradsc                 C   s&   t | | j| }|| k	r| S tdS )z
        Returns the model's input embeddings.

        Returns:
            `nn.Module`: A torch module mapping vocabulary to hidden states.
        N)r  r  r  r   )r[  r  rf   rf   rg   r  =  s    z$PreTrainedModel.get_input_embeddingsr0  c                 C   s*   t | | j| }|| k	r"|| ntdS )z
        Set model's input embeddings.

        Args:
            value (`nn.Module`): A module mapping vocabulary to hidden states.
        N)r  r  set_input_embeddingsr   )r[  r0  r  rf   rf   rg   r  J  s    z$PreTrainedModel.set_input_embeddingsc                 C   s   dS )z
        Returns the model's output embeddings.

        Returns:
            `nn.Module`: A torch module mapping hidden states to vocabulary.
        Nrf   r^  rf   rf   rg   get_output_embeddingsW  s    z%PreTrainedModel.get_output_embeddingsc                 C   s   dS )z\
        Initialize the weights. This method should be overridden by derived class.
        Nrf   rZ  rf   rf   rg   ro   `  s    zPreTrainedModel._init_weightsc                 C   s$   t |ddrdS | | d|_dS )zM
        Initialize the weights if they are not already initialized.
        r   FNT)r  ro   r   rZ  rf   rf   rg   _initialize_weightsf  s    
z#PreTrainedModel._initialize_weightsc                 C   s   t | jddr.|  }|dk	r.| ||   t | jddrvt | jddrvt| | jrbt | | j} | | j| j	| j | 
 D ]}t|dr~|  q~dS )z
        Tie the weights between the input embeddings and the output embeddings.

        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
        weights instead.
        tie_word_embeddingsTNZis_encoder_decoderFZtie_encoder_decoder_tie_weights)r  ru  r  _tie_or_clone_weightsr  r   r  _tie_encoder_decoder_weightsencoderdecoderr   r  )r[  output_embeddingsru   rf   rf   rg   tie_weightso  s    
zPreTrainedModel.tie_weights)r  r  r  c                    sz   g }|j | j kr*t|j  d| j  d dtjtjttt d fdd  || || t|dkrvtd|  d S )	N and zZ are not equal. In this case make sure that all encoder weights are correctly initialized.r   )decoder_pointerencoder_pointerr   uninitialized_encoder_weightsc                    s  t | tjrt |tjs,t|  d| dt| drpt|dsDt| j|_t| drlt|dsdt| j|_d S |j}| j}t|dkrt|dkstd| d|   fdd	|	 D }d}|
 D ]\}	}
|	 r(tt|	| }|	}t || t|| sRt|t|krR|d
8 }qn*|	|kr6qn|dkrJtdn|	 }}|| ||  d |	 ||d
 d | d |  q|t|7 }d S )Nr  z have to be of type nn.Moduler   r  r   zEncoder module z does not match decoder module c                    s   h | ]} d  | qS )/rf   )ry   Zsub_namer   rf   rg   	<setcomp>  s     zkPreTrainedModel._tie_encoder_decoder_weights.<locals>.tie_encoder_to_decoder_recursively.<locals>.<setcomp>r   i  zMax depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model.r  )depth)r   r   r   r  r   r   r  r  r   r   r   isdigitr   rj   typer   r  r   )r  r  r   r  r  Zencoder_modulesZdecoder_modulesZall_encoder_weightsZencoder_layer_posr  ru   Zencoder_nameZdecoder_name"tie_encoder_to_decoder_recursivelyr   rg   r    sb     






zXPreTrainedModel._tie_encoder_decoder_weights.<locals>.tie_encoder_to_decoder_recursivelyz;The following encoder weights were not tied to the decoder )r   )	r   r   r  r   r   r   r	   r   r   )r  r  r  r  rf   r  rg   r    s"    
 <z,PreTrainedModel._tie_encoder_decoder_weightsc                 C   s   | j jrt|j |_n|j|_t|dddk	rftj|j	j
d|jjd |j	jd  fdd|j	_
t|drt|dr|j|_dS )zPTie or clone module weights depending of whether we are using TorchScript or notr  Nr   Zconstantout_featuresnum_embeddings)ru  Ztorchscriptr   r#  r   cloner  
functionalpadr  datarn  r   r  r  )r[  r  Zinput_embeddingsrf   rf   rg   r    s    	z%PreTrainedModel._tie_or_clone_weights)new_num_tokenspad_to_multiple_ofrv   c                 C   sJ   |  ||}|dkr |dkr |S |jjd | j_|jjd | _|   |S )a  
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value.If `new_num_tokens` is set to
                `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
        Nr   )_resize_token_embeddingsr   rn  ru  Z
vocab_sizer  )r[  r  r  Zmodel_embedsrf   rf   rg   resize_token_embeddings  s    z'PreTrainedModel.resize_token_embeddingsc           	   	   C   s   |   }| |||}t|dr0|j}t|| | | |d k	rt r|dd l}|jj	|j
d d |j
jd }W 5 Q R X n|j
jd }|  d k	r| jjs|  }| ||}t|dr|j}t|| | | |   S )N_hf_hookr   r	  )r  _get_resized_embeddingsr   r  rR   r  r   r
  r  r  r   rn  r  ru  r  _get_resized_lm_headZset_output_embeddings)	r[  r  r  old_embeddingsnew_embeddingshookr
  old_lm_headnew_lm_headrf   rf   rg   r  	  s(    





z(PreTrainedModel._resize_token_embeddings)r  r  r  rv   c           
   	   C   s  |dk	rLt |ts"td| d|dkr6|jjd }|| d | | }ntd| d |dkrj|S t rddl}|j	j
|jdd |j \}}W 5 Q R X n|j \}}||krt s|S t |tjstd	t| d
tj dtj dtj|||jj|jjd}| | t||}t rddl}|j|jg}	|j	j
|	dd2 |jjd|ddf |jjd|ddf< W 5 Q R X n,|jjd|ddf |jjd|ddf< |S )aj  
        Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
        initialized vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_embeddings (`torch.nn.Embedding`):
                Old embeddings to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the embedding matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


        Return:
            `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
            `new_num_tokens` is `None`
        Nz5Asking to pad the embedding matrix to a multiple of `z@`, which is not and integer. Please make sure to pass an integerr   r   zYou are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be a.  . This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tcr	  zOld embeddings are of type , which is not an instance of zj. You should either use a different resize function or make sure that `old_embeddings` are an instance of r   ri  )r   rj   r   r   rn  r   r  r   r
  r  r  sizer   r  	TypeErrorr  r   r   ro   rf  r  )
r[  r  r  r  r
  old_num_tokensZold_embedding_dimr  nparamsrf   rf   rg   r  &  sN    "




8,z'PreTrainedModel._get_resized_embeddings)r  r  
transposedrv   c              	   C   st  |dkr|S t  rZddl}|jj|jdd& |s<|j n|j  \}}W 5 Q R X n |sh|j n|j  \}}||krt  s|S t|tj	st
dt| dtj	 dtj	 d|s||fn||f}|jdk	}tj	|||jj|jjd}	| |	 t||}
t  r^ddl}|j|j|	j|	jg}|jj|dd | |	||
|| W 5 Q R X n| |	||
|| |	S )	aN  
        Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_lm_head (`torch.nn.Linear`):
                Old lm head liner layer to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the linear matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults
                to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim,
                vocab_size` else `vocab_size, lm_head_dim`.

        Return:
            `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is
            `None`
        Nr   r	  z#Old language model head is of type r  zg. You should either use a different resize function or make sure that `old_lm_head` are an instance of r   )r  r   r   )r   r
  r  r  r   r  r   r   r   Linearr  r  r  r   r   ro   rf  !_copy_lm_head_original_to_resized)r[  r  r  r  r
  r  Zold_lm_head_dimZnew_lm_head_shapehas_new_lm_head_biasr  num_tokens_to_copyr  rf   rf   rg   r    sX    


        z$PreTrainedModel._get_resized_lm_headc                 C   s   |s2|j jd |d d f |j jd |d d f< n,|j jd d d |f |j jd d d |f< |r~|jjd | |jjd |< d S r,  )r   r  r  )r[  r  r  r  r  r  rf   rf   rg   r    s
    .,z1PreTrainedModel._copy_lm_head_original_to_resized)new_num_position_embeddingsc                 C   s(   t d| j d| j d| jj dd S )Nz4`resize_position_embeddings` is not implemented for B`. To implement it, you should overwrite this method in the class  in `modeling_.py`r   r   r  )r[  r  rf   rf   rg   resize_position_embeddings  s    z*PreTrainedModel.resize_position_embeddingsc                 C   s(   t d| j d| j d| jj dd S )Nz1`get_position_embeddings` is not implemented for r  r  r  r  r^  rf   rf   rg   get_position_embeddings  s    z'PreTrainedModel.get_position_embeddingsc                 C   s2   | j jr| | j j tr.| | j |   dS )z
        If needed prunes and maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any
        initialization logic in `_init_weights`.
        N)ru  pruned_headsprune_headsro   applyr  r  r^  rf   rf   rg   r    s
    zPreTrainedModel.init_weights)heads_to_prunec                 C   sN   |  D ]4\}}t| jj|g t|B }t|| jj|< q| j| dS )a  
        Prunes heads of the base model.

        Arguments:
            heads_to_prune (`Dict[int, List[int]]`):
                Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
                to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
                layer 1 and heads 2 and 3 on layer 2.
        N)r   r   ru  r  re   r   r  Z_prune_heads)r[  r   layerZheadsZunion_headsrf   rf   rg   r  
  s    zPreTrainedModel.prune_headsc                 C   sD   | j st| jj d| t| jdd t| ddr@|   dS )z
        Activates gradient checkpointing for the current model.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".
        z) does not support gradient checkpointing.Tr  _hf_peft_config_loadedFN)	r  r   r   r   r  r   _set_gradient_checkpointingr  r  r^  rf   rf   rg   r    s
    z-PreTrainedModel.gradient_checkpointing_enablec                 C   s2   | j r| t| jdd t| ddr.|   dS )z
        Deactivates gradient checkpointing for the current model.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".
        Fr  r  N)r  r  r   r  r  r  r^  rf   rf   rg   gradient_checkpointing_disable-  s    z.PreTrainedModel.gradient_checkpointing_disablec                 C   s   t dd |  D S )z
        Whether gradient checkpointing is activated for this model or not.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".
        c                 s   s   | ]}t |d o|jV  qdS )r  N)r   r  ry   mrf   rf   rg   r/  B  s     z<PreTrainedModel.is_gradient_checkpointing.<locals>.<genexpr>)r5  r   r^  rf   rf   rg   is_gradient_checkpointing:  s    z)PreTrainedModel.is_gradient_checkpointingTr   )
save_directoryis_main_processr   save_functionpush_to_hubr   safe_serializationrG  tokensave_peft_formatc           0   	      s  | dd}|dk	r4tdt |	dk	r0td|}	|	dk	rD|	|d< t| dd}t| ddrtt| d	dst|sttd
t| ddr|stdd|krtd | d}|rt stdt	j
|rtd| d dS t	j|dd |r.| dd}| d|t	j
jd }| j|f|}| |}t| }t|}t|dd |j_|jjg|j_| jdk	r~t| || jd |r8|s|j| |  r|j| |r8t d |! }|
rt d i }|" D ]\}}||d| < q|}| # }t$|dkrtd|d }| j%| }|| |dkrJ|& }t'rnt(j)j*j+D ]\}}||}qZ| j,dk	r| j,D ]}||- kr||= q|rt./t0}|" D ]\ }|t1| 2  qdd  |" D }t3 }|4 D ]}| j5dk	rZd} t6|D ]J t7 fd!d"| j5D }!|!r |kr| d7 } | t$|k r| = qd} |D ]2  |krb| d7 } | dkrb| = |8  qbqt$|dkrt9d#| d$ |s|rt:nt;}"t<|"|}"n|rt=nt>}"t?|||"d%\}#}$t	@|D ]}%t	j
A||%}&|"Bd&d'Bd(d'}'|%Bd&d'Bd(d'}(tCDd)})|%E|'rt	j
|&r|%|#- kr|r|)F|(dk	rt	G|& q|#" D ]B\}*}+|rtH|+t	j
A||*d*d+id, n||+t	j
A||* q|$dkrt	j
A|t<t;|},t d-|,  n||rtIntJ}-t	j
A|t<|-|}-tK|-d.d/d0$}.tLjM|$d1dd2d3 }/|.N|/ W 5 Q R X t d4| d5t$|# d6|- d |r| jO|||||	d7 dS )8a  
        Save a model and its configuration file to a directory, so that it can be re-loaded using the
        [`~PreTrainedModel.from_pretrained`] class method.

        Arguments:
            save_directory (`str` or `os.PathLike`):
                Directory to which to save. Will be created if it doesn't exist.
            is_main_process (`bool`, *optional*, defaults to `True`):
                Whether the process calling this is the main process or not. Useful when in distributed training like
                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
                the main process to avoid race conditions.
            state_dict (nested dictionary of `torch.Tensor`):
                The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
                save parts of the model or if special precautions need to be taken when recovering the state dictionary
                of a model (like when using model parallelism).
            save_function (`Callable`):
                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
                need to replace `torch.save` by another method.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).

                <Tip warning={true}>

                If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
                which will be bigger than `max_shard_size`.

                </Tip>

            safe_serialization (`bool`, *optional*, defaults to `False`):
                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
            variant (`str`, *optional*):
                If specified, weights are saved in the format pytorch_model.<variant>.bin.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            save_peft_format (`bool`, *optional*, defaults to `True`):
                For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
                keys of the state dict of adapters needs to be pre-pended with `base_model.model`. Advanced users can
                disable this behaviours by setting `save_peft_format` to `False`.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        use_auth_tokenNVThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.V`token` and `use_auth_token` are both specified. Please set only the argument `token`.r  r  Fis_loaded_in_8bitis_8bit_serializablezYou are calling `save_pretrained` to a 8-bit converted model you may likely encounter unexepected behaviors. If you want to save 8-bit models, make sure to have `bitsandbytes>0.37.2` installed.r  z]You are calling `save_pretrained` on a 4-bit converted model. This is currently not supportedZsave_configze`save_config` is deprecated and will be removed in v5 of Transformers. Use `is_main_process` instead.zR`safe_serialization` requires the `safetensors library: `pip install safetensors`.zProvided path (z#) should be a directory, not a fileTexist_okcommit_messagerepo_idri   r   r   r  zhDetected adapters on the model, saving the model in the PEFT format, only adapter weights will be saved.zTo match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`.zbase_model.model.zMultiple active adapters detected, saving multiple active adapters is not supported yet. You can save adapters separately one by one by iteratively calling `model.set_adapter(adapter_name)` then `model.save_pretrained(...)`r   c                 S   s"   i | ]\}}t |d kr||qS r   r   )ry   Zptrnamesrf   rf   rg   
<dictcomp>  s       z3PreTrainedModel.save_pretrained.<locals>.<dictcomp>c                 3   s   | ]}t | V  qd S r,  r   r   )ry   patr  rf   rg   r/    s     z2PreTrainedModel.save_pretrained.<locals>.<genexpr>zRemoved shared tensor zk while saving. This should be OK, but check by verifying that you don't receive any warning while reloading)r   r   r   r   r   z(.*?)-\d{5}-of-\d{5}r   r   )r   zModel weights saved in wr   r   ra  )indent	sort_keys
z:The model is bigger than the maximum size per checkpoint (z) and is going to be split in z^ checkpoint shards. You can find where each parameters has been saved in the index located at )r  r  )Pr  rj  rk  rl  r   r  r   r=   rK  rc   r   r   r   errormakedirsr  sepZ_create_repoZ_get_files_timestampsunwrap_modelr   r   ru  r  r   r   architectures_auto_classr   save_pretrainedr  r  r  Zget_adapter_state_dictr   Zactive_adaptersr   Zpeft_configr   IS_SAGEMAKER_MP_POST_1_10smpstateZmodule_managerZtranslate_functions_keys_to_ignore_on_saver   collectionsdefaultdictr   r   r   r   r   _tied_weights_keyssortedr5  addwarning_oncer(   r,   rH  r"   r#   r   listdirr   r   r   compiler   	fullmatchr  safe_save_filer'   r+   r   r   dumpswriteZ_upload_modified_files)0r[  r  r	  r   r
  r  r   r  rG  r  r  rQ  r  r  r  r  Zfiles_timestampsZmodel_to_saver   Zpeft_state_dictr   r0  Zactive_adapterZcurrent_peft_configZ	smp_to_hf_Z
ignore_keyptrsr  Zshared_ptrsZ
warn_namesr  foundZmatches_patternr   r   r   filenameZfull_filenameZweights_no_suffixZfilename_no_suffixregr   r   Zpath_to_weightsZsave_index_filer   contentrf   r  rg   r)  D  s4   = 



















zPreTrainedModel.save_pretrainedc                 C   s<   t dd |  D }|r8t dd |  D }|| }|S )a  
        Get the memory footprint of a model. This will return the memory footprint of the current model in bytes.
        Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the
        PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2

        Arguments:
            return_buffers (`bool`, *optional*, defaults to `True`):
                Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers
                are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch
                norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
        c                 S   s   g | ]}|  |  qS rf   ZnelementZelement_size)ry   rC  rf   rf   rg   r|   i  s     z8PreTrainedModel.get_memory_footprint.<locals>.<listcomp>c                 S   s   g | ]}|  |  qS rf   r@  )ry   bufrf   rf   rg   r|   k  s     )r  r   r   )r[  Zreturn_buffersrS  Zmem_bufsrf   rf   rg   get_memory_footprint]  s
    z$PreTrainedModel.get_memory_footprintc                    s.   t | dd tjkrtdnt j||S d S )Nquantization_methodzCalling `cuda()` is not supported for `4-bit` or `8-bit` quantized models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.)r  rJ   BITS_AND_BYTESr   r  r  r[  r  rQ  r  rf   rg   r  o  s
    zPreTrainedModel.cudac                    s.   t | dd tjkrtdnt j||S d S )NrC  z`.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.)r  rJ   rD  r   r  r$  rE  r  rf   rg   r$  z  s
    zPreTrainedModel.toc                    s&   t | ddrtdnt j| S d S )Nr@  Fz`.half()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r  r   r  halfr[  r  r  rf   rg   rF    s
    zPreTrainedModel.halfc                    s&   t | ddrtdnt j| S d S )Nr@  Fz`.float()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r  r   r  r   rG  r  rf   rg   r     s
    zPreTrainedModel.floatmain)ru  	cache_dirignore_mismatched_sizesforce_downloadlocal_files_onlyr  revisionuse_safetensors)	pretrained_model_name_or_pathru  rI  rJ  rK  rL  r  rM  rN  c          Y         s  | dd}| dd}| dd}| dd}| dd}| dd}| d	d}| d
d}| dd}| dd}| dd}| dd}| dd| dd}| dd | dd}| dd}| dd}| dd}| dd}| dd}| dd}| dd} | dd}!| di }"| dd}#| d d}$t rPd}|dk	r|td!t |dk	rxtd"|}|dk	r|"dk	rd#|"kr||"d#< |	dkrt sd}	t rt	t
jd$t	d%k}%nd}%|dkrtd& | dkrDt|ts8t|t||||||||ddd'}&t|&| } nt|dd} t r|" d(d}'|'dkrt|f|||||| d)|"}'|'dk	rtj|'rt|'d*d+d,}(|}'t|(d- }W 5 Q R X nd}'t tjrd i nzt tr4 d.kr4zdt i W n& tk
r0   td/  d0Y nX n(t t r\ d1k rTtd2nd i  dk	r|dkrvd}n|std3|r dk	rt!d4 t" rtd5nt# st$d6d})|dk	rt|d7t%j&})|dkr|s|rt%j&})t'j(f ||d8dd9|\}}n@|)t%j&krX|j)}|j*}d:d; |+ D }*t,|*d1krXtd<|sd|rt# rtt s|t$d=dkrt-d> d? tj. dkrtj/0 rdtj/1 i ntd@t-dA |dkrd}|s|rtdB||B  }+dCdD|dE},|dk	r ||,dF< t2 r<|s<t-dG d}t|ts|dk	rV|n|}-| j3j4|-f|d|||||||||dH|\}}.n|}.d}/d}0t5|dr|j67d7t%j&}0|0t%j8kr|)dk	r|9 }1|1+ D ]\}2}3|3|j6|2< qd})tdI |)t%j8ks|0t%j8krtj/0 s,tdJnNt: r<t; sFt$dKn4t	t
jdLt	dMk rnt$dNnd1dOl<m=}4 |0t%j8krt>(|j6}||_6dkrtj.n
t-dP |4(|? }/|%r|)t%j&kr|r|0t%j&krtdQ ||_6n|%r|s|0t%j&kr|j6}t|t@r2t'j(|ddR}n"t|t'r@ntdStA| dT|j)}|rڈdkrptj. dkrtj/0 rdtj/1 i ntd@t-dA |dkrd}n"|%s|s|0t%j&krtdU d}5d}6d}7dd}8|dk	rt|}tjB|}9|9
r|rNtjtjC||tDdV rNtjC||tDdV }:q|rtjtjC||tErtjC||tE}:q|rtjtjC||tFrtjC||tF}:q|	dk	rtjtjC||tGtH|!rtjC||tGtH|!}:q|	dk		rBtjtjC||tGtI|!	rBtjC||tGtI|!}:d}5qtjtjC||tGtJ|!	r|tjC||tGtJ|!}:qtjtjC||tGtK|!	rtjC||tGtK|!}:d}5ntjtjC||tDdV 	stjtjC||tE
rtLdWtGtJ|! dX| dYntjtjC||tF
rFtLdWtGtJ|! dX| dZnV|	
rjtLdWtGtH|! dX| d0n2tLdWtGtJ|! d[tE d[tDdV  d\tF dX| d0ntjtjC||
r|}:d}9ntjtjC||dV r|
std]|dV  d^tjC||dV }:d}9nvtM|r.|};tN|}<n\|r:tE};n,|rFtF};n |	dk	r\tGtH|!};n
tGtJ|!};z|||||||,||d| d_}=t||;f|=}<|<dkr|;tGtH|!krt|tGtI|!f|=}<|<dk	rd}5nB|	rtLd`tGtH|! d\tGtI|! dantGtJ|!};t||;f|=}<|<dkrP|;tGtJ|!krPt|tGtK|!f|=}<|<dk	rPd}5|<dkr&|||db}>tO|tEf|>rtL| dctGtJ|! dYntO|tFf|>rtL| dctGtJ|! dZnh|!dk	rtO|tJf|>rtL| dctGtJ|! dd|! den,tL| dctGtJ|! d[tE d[tD d\tF d0
W n` tLk
r@    Y nJ tPk
r   tLdf| dg| dhtGtJ|! d[tE d[tD d\tF d0Y nX |9rt-di|:  |:}<nt-di|; dj|<  nd}<|5rtQ||<|||||||,||| dk\}<}6|+r|5s
|dkr
tR|<}d}?dk	rttrdlkrt5|dr`|jSdk	r`|jSt-dm dn nJ|5rzdo|6krz|6do n&|5stT|ntR|<d1 }@tT|@~@t-dp ntdq | U}?| jVdk	otj.kp|p|}8|5r|6dr }AntW|X }A|s|8rt# rd}||_YtZ|dsg}Bt" r^d1dl[}Ct-dt |Cj\j]t^ dug|B }Bn|sp|sp|r||B_t`  |$r| ja| dv}tb|B | |f|
|.}DW 5 Q R X |8rt# rd}|DjVng |s|rdwdxlcmd}Eme}F |jf}G|jg}H|rt-dy n
t-dz |Gdkr,|E|Dn|GttWsBgh t t@rt, X dwkrd{d|  + D }It,|Id1kr|Hstd}h|I t	t
jd$t	d~k}J|r|Jstd|F|D|d}Dt	t
jd$t	dk|D_i||Djj_6|%|D_k|r&dkr&td |0t%j8krB|/l|D}Dd|D_i|0dk	rT|0|D_mn|)dk	rd|)|D_mt5|Ddrvd|D_nt tr\i }K|s|r|Kofdd;|Dp D  |Kofdd;|Dp D  }L|rt	t
jdt	dkrd1dlqmr}M |Mjs}Lntdn|rtjt}L|DjudkrBt|Djvjw d  d|Dju}N d.krZtdd|Ni}Odtxytzj{kr~|K|Od< nt,|Kd1krtd  dkrt||Df|L dk|d|O}nt}|}t|Dddt%j&krdd; |+ D }||Od< |D~  tz|Dfdo|Li|O |s|r fdd; X D }Pd|P ksPd|P krXtd~Pn$ dk	r|D~  t|D}Qt|Q  |r|<dVr| |D||<dd }DnHz$dwdlm}R |R|D|<ddd\}D}7W n" t$k
r   td  Y nX n|r:zdwdlm}S |S|D|<}DW n" t$k
r6   td  Y nX n\|+r|?dk	rTt|? | j|D||A|<|||6|| ||t|Dddt%j&kd\}D}T}U}V}W}X||D_||D_|D~  |D  |D r|dk	rz.tj4|f||||||||||d
||D_W n  tk
r   t-d Y nX  dk	rT ||Wd}Odtxytj{krH|Dj|Od< t|Df|O |)t%j8kr|jdkrr||_| jdkrtd|/|D|j t>(|/? |Djj_6d|D_i|0t%j8kr|/|D}D|'dk	r|Dj|'|#||"d |r|7dkr|T|U|V|Xd}7|D|7fS |DS )a<  
        Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you should first set it back in training mode with `model.train()`.

        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
        task.

        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
        weights are discarded.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                    - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
                      `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to
                      `True`.
                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
                      arguments `config` and `state_dict`).
            model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
                Can be either:

                    - an instance of a class derived from [`PretrainedConfig`],
                    - a string or path valid as input to [`~PretrainedConfig.from_pretrained`].

                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                be automatically loaded when:

                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                      model).
                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
                      save directory.
                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                      configuration JSON file named *config.json* is found in the directory.
            state_dict (`Dict[str, torch.Tensor]`, *optional*):
                A state dictionary to use instead of a state dictionary loaded from saved weights file.

                This option can be used if you want to create a model from a pretrained configuration but load your own
                weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and
                [`~PreTrainedModel.from_pretrained`] is not a simpler option.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            from_tf (`bool`, *optional*, defaults to `False`):
                Load the model weights from a TensorFlow checkpoint save file (see docstring of
                `pretrained_model_name_or_path` argument).
            from_flax (`bool`, *optional*, defaults to `False`):
                Load the model weights from a Flax checkpoint save file (see docstring of
                `pretrained_model_name_or_path` argument).
            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
                checkpoint with 3 labels).
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            output_loading_info(`bool`, *optional*, defaults to `False`):
                Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether or not to only look at local files (i.e., do not try to download the model).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.

                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>".

                </Tip>

            mirror (`str`, *optional*):
                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                Please refer to the mirror site for more information.
            _fast_init(`bool`, *optional*, defaults to `True`):
                Whether or not to disable fast initialization.

                <Tip warning={true}>

                One should only disable *_fast_init* to ensure backwards compatibility with `transformers.__version__ <
                4.6.0` for seeded model initialization. This argument will be removed at the next major version. See
                [pull request 11471](https://github.com/huggingface/transformers/pull/11471) for more information.

                </Tip>

            > Parameters for big model inference

            low_cpu_mem_usage(`bool`, *optional*):
                Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
                This is an experimental feature and a subject to change at any moment.
            torch_dtype (`str` or `torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model under a specific `dtype`. The different options
                are:

                1. `torch.float16` or `torch.bfloat16` or `torch.float`: load in a specified
                  `dtype`, ignoring the model's `config.torch_dtype` if one exists. If not specified
                  - the model will get loaded in `torch.float` (fp32).

                2. `"auto"` - A `torch_dtype` entry in the `config.json` file of the model will be
                  attempted to be used. If this entry isn't found then next check the `dtype` of the first weight in
                  the checkpoint that's of a floating point type and use that as `dtype`. This will load the model
                  using the `dtype` it was saved in at the end of the training. It can't be used as an indicator of how
                  the model was trained. Since it could be trained in one of half precision dtypes, but saved in fp32.

                <Tip>

                For some models the `dtype` they were trained in is unknown - you may try to check the model's paper or
                reach out to the authors and ask them to add this information to the model's card and to insert the
                `torch_dtype` entry in `config.json` on the hub.

                </Tip>

            device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
                A map that specifies where each submodule should go. It doesn't need to be refined to each
                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
                same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
                like `1`) on which the model will be allocated, the device map will map the entire model to this
                device. Passing `device_map = 0` means put the whole model on GPU 0.

                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
                more information about each option see [designing a device
                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
            max_memory (`Dict`, *optional*):
                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
                GPU and the available CPU RAM if unset.
            offload_folder (`str` or `os.PathLike`, *optional*):
                If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
            offload_state_dict (`bool`, *optional*):
                If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU
                RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to
                `True` when there is some disk offload.
            load_in_8bit (`bool`, *optional*, defaults to `False`):
                If `True`, will convert the loaded model into mixed-8bit quantized model. To use this feature please
                install `bitsandbytes` (`pip install -U bitsandbytes`).
            load_in_4bit (`bool`, *optional*, defaults to `False`):
                If `True`, will convert the loaded model into 4bit precision quantized model. To use this feature
                install the latest version of `bitsandbytes` (`pip install -U bitsandbytes`).
            quantization_config (`Union[QuantizationConfigMixin,Dict]`, *optional*):
                A dictionary of configuration parameters or a QuantizationConfigMixin object for quantization (e.g
                bitsandbytes, gptq)
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            variant (`str`, *optional*):
                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
                ignored when using `from_tf` or `from_flax`.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
                is not installed, it will be set to `False`.

            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                automatically loaded:

                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                      already been done)
                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
                      corresponds to a configuration attribute will be used to override said attribute with the
                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                      will be passed to the underlying model's `__init__` function.

        <Tip>

        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
        use this method in a firewalled environment.

        </Tip>

        Examples:

        ```python
        >>> from transformers import BertConfig, BertModel

        >>> # Download model and configuration from huggingface.co and cache.
        >>> model = BertModel.from_pretrained("bert-base-uncased")
        >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
        >>> model = BertModel.from_pretrained("./test/saved_model/")
        >>> # Update configuration during loading.
        >>> model = BertModel.from_pretrained("bert-base-uncased", output_attentions=True)
        >>> assert model.config.output_attentions == True
        >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
        >>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
        >>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
        >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
        >>> model = BertModel.from_pretrained("bert-base-uncased", from_flax=True)
        ```

        * `low_cpu_mem_usage` algorithm:

        This is an experimental function that loads the model using ~1x model size CPU memory

        Here is how it works:

        1. save which state_dict keys we have
        2. drop state_dict before the model is created, since the latter takes 1x model size CPU memory
        3. after the model has been instantiated switch to the meta device all params/buffers that
        are going to be replaced from the loaded state_dict
        4. load state_dict 2nd time
        5. replace the params/buffers from the state_dict

        Currently, it can't handle deepspeed ZeRO stage 3 and ignores loading errors

        r   Nfrom_tfF	from_flaxresume_downloadproxiesoutput_loading_infor  trust_remote_codeZmirror_from_pipeline
_from_auto
_fast_initTr  low_cpu_mem_usager;  
max_memoryr<  offload_state_dictload_in_8bitload_in_4bitquantization_config	subfolderr   _commit_hashrG  adapter_kwargsadapter_namedefaultuse_flash_attention_2r  r  r  r  z0.37.2zgThe argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.)
rI  rK  rR  rS  rL  r  rM  r_  %_raise_exceptions_for_missing_entriesZ'_raise_exceptions_for_connection_errors_adapter_model_path)rI  rK  rR  rS  rL  r`  r   r   r   Zbase_model_name_or_path)autoZbalancedbalanced_low_0
sequentialzWhen passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or 'auto', 'balanced', 'balanced_low_0', 'sequential' but found r   r   znYou can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' z>Passing along a `device_map` requires `low_cpu_mem_usage=True`ztorch>=1.10z`DeepSpeed Zero-3 is not compatible with `low_cpu_mem_usage=True` or with passing a `device_map`.z^Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`Zquant_method)r\  r]  )Zconfig_dictreturn_unused_kwargsc                 S   s&   i | ]\}}|t tjkr||qS rf   )r7  r8  rH   r   rx   rf   rf   rg   r  *
  s      z3PreTrainedModel.from_pretrained.<locals>.<dictcomp>zYou can't pass `load_in_8bit` or any other `BitsAndBytesConfig` argument as a kwarg when passing `quantization_config` argument at the same time.zUsing `load_in_8bit=True` requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes `pip install -i https://test.pypi.org/simple/ bitsandbytes` or pip install bitsandbytes` zOverriding torch_dtype=z with `torch_dtype=torch.float16` due to requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass torch_dtype=torch.float16 to remove this warning.z/No GPU found. A GPU is needed for quantization.zThe device_map was not initialized.Setting device_map to {'':torch.cuda.current_device()}.If you want to use the model for inference, please set device_map ='auto' zConverting into 4-bit or 8-bit weights from tf/flax weights is currently not supported, please make sure the weights are in PyTorch format.r   Zpytorch)	file_typer   from_auto_classZusing_pipelinez+Offline mode: forcing local_files_only=True)rI  rj  rK  rR  rS  rL  r  rM  r_  rW  rV  aS  You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.z2GPU is required to quantize or run quantize model.zwLoading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq library (`pip install auto-gptq`)Z	auto_gptqz0.4.2zWYou need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq`)GPTQQuantizerzRWe suggest you to set `torch_dtype=torch.float16` for better efficiency with GPTQ.zYou passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.)rj  z(Invalid type for `quantization_config`: z8. Should be a `dict` or a `BitsAndBytesConfig` instance.a  Detected the presence of a `quantization_config` attribute in the model's configuration but you don't have the correct `bitsandbytes` version to support int8 serialization. Please install the latest version of `bitsandbytes` with  `pip install --upgrade bitsandbytes`.z.indexzError no file named z found in directory zf but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.zb but there is a file for Flax weights. Use `from_flax=True` to load this model from those weights.z, r   z$We found a TensorFlow checkpoint at z:, please set from_tf to True to load from this checkpoint.)rI  rK  rS  rR  rL  r  
user_agentrM  r_  re  r`   z and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`.)rM  rS  r  z& does not appear to have a file named z) but there is a file without the variant z;. Use `variant=None` to load this model from those weights.zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named zloading weights file z from cache at )
rI  rK  rS  rR  rL  r  rn  rM  r_  r`  rg  zWill use torch_dtype=z$ as defined in model's config objectr   zSince the `torch_dtype` attribute can't be found in model's config object, will use torch_dtype={torch_dtype} as derived from model's weightszD`torch_dtype` can be either `torch.dtype` or `"auto"`, but received Zall_checkpoint_keys)rp   r  r  )r  r;  r   )get_keys_to_not_convertreplace_with_bnb_linearz?Detected 8-bit loading: activating 8-bit loading for this modelz?Detected 4-bit loading: activating 4-bit loading for this modelc                 S   s   g | ]\}}|d kr|qS ))r1  r   rf   ry   r   r0  rf   rf   rg   r|   .  s      z3PreTrainedModel.from_pretrained.<locals>.<listcomp>zIf you want to offload some keys to `cpu` or `disk`, you need to set `llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be  converted to 8-bit but kept in 32-bit.z0.39.0zYou have a version of `bitsandbytes` that is not compatible with 4bit inference and training make sure you have the latest version of `bitsandbytes` installed)modules_to_not_convertr^  z0.37.0zYou are loading your model in 8bit but you did not specify a `torch_dtype` attribute.All non-linear modules will be loaded in full precision. If you want to load the other modules in other precision, please specify a `torch_dtype` attribute.rC  c                    s,   i | ]$\ }t  fd dD r qS )c                 3   s   | ]}| kV  qd S r,  rf   r  r  rf   rg   r/  d  s     =PreTrainedModel.from_pretrained.<locals>.<dictcomp>.<genexpr>)r5  ry   r:  )rs  r  r  rg   r  a  s    c                    s.   i | ]&\ }t  fd dD r tjqS )c                 3   s   | ]}| kV  qd S r,  rf   r  r  rf   rg   r/  l  s     rt  )r5  r_   r   ru  )rB  r  rg   r  i  s    
acceleratez0.19.0)CustomDtypeaU  You are using `device_map='auto'` on a 4bit loaded version of the model. To automatically compute the appropriate device map, you should upgrade your `accelerate` library,`pip install --upgrade accelerate` or install it from source to support fp4 auto device mapcalculation. You may encounter unexpected behavior, or pass your own device mapz does not support `device_map='z^'`. To implement support, the modelclass needs to implement the `_no_split_modules` attribute.ziIf passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or 'sequential'.Zno_split_module_classesspecial_dtypeszThis model has some weights that should be kept in higher precision, you need to upgrade `accelerate` to properly deal with them (`pip install --upgrade accelerate`).ri  rh  )r   Zlow_zerorZ  c                 S   s   i | ]\}}||d  qS )g?rf   )ry   r   valrf   rf   rg   r    s      c                    s   i | ]}|kr| | qS rf   rf   r   )r;  rs  rf   rg   r    s      r   r1  ad  
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        i)$load_tf2_checkpoint_in_pytorch_model)Zallow_missing_keysrT  zLoading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.)%load_flax_checkpoint_in_pytorch_modelzLoading a Flax model in PyTorch, requires both PyTorch and Flax to be installed. Please see https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation instructions.)
rJ  sharded_metadatarX  rY  r;  r<  r[  r   r@  rB  )
rI  rK  rR  rS  rL  r  rM  r_  rW  rV  zZGeneration config file not found, using a generation config created from the model config.)r;  Zoffload_dirr=  	skip_keysr  z%We can only quantize pure text model.)rb  r  ra  )r   r   mismatched_keysr  )r  rh   rj  rk  rl  r   r=   r7   r   r  r  r   r   r   r   r   r0   r$   r3   r  r;   rn   rc   r   r   r   r   r   r_   r   r   r   rj   rK   r   r5   rK  rJ   rD  rH   	from_dictr\  r]  r   r   r  r6  r  ra   Zcurrent_devicer9   config_classfrom_pretrainedr   r^  re   ZGPTQZget_loading_attributesr:   r6   Zoptimum.gptqrm  rI   to_dictr  r  isdirr   r*   r)   r&   rH  r(   r'   r,   r+   EnvironmentErrorr<   r2   r4   r   rC   r   r  r   r  _keep_in_fp32_modulesr   r   r  rq   r
  r  r  r   r   rQ   r  r-   r4  rp  rq  llm_int8_skip_modulesZ llm_int8_enable_fp32_cpu_offloadextendZ_is_quantized_training_enabledru  r  Zconvert_modelrC  r@  updater  accelerate.utilsrw  ZINT4r9  _no_split_modulesr   r   r7  r8  rP   r   rU   rV   r  r   rT   rS   r   Zload_tf_weightsZmodeling_tf_pytorch_utilsrz  r#  Zmodeling_flax_pytorch_utilsr{  r  _load_pretrained_modelr  r  evalr  r   r  r   rO   _skip_keys_device_placementZ	tokenizerr  Zquantize_modelZpost_init_modelZload_adapter)Yr  rO  ru  rI  rJ  rK  rL  r  rM  rN  Z
model_argsrQ  r   rP  rQ  rR  rS  rT  r  rU  r:  Zfrom_pipelinerl  rX  rY  rZ  r<  r[  r\  r]  r^  r_  Zcommit_hashrG  ra  rb  rd  r  Zresolved_config_filerf  r   Zquantization_method_from_argsZquantization_config_kwargsZfrom_ptrn  Zconfig_pathZmodel_kwargsZ	quantizerZquantization_method_from_configZloading_attr_dictattrry  rm  Z
is_shardedr|  Zloading_infoZuse_keep_in_fp32_modulesis_localarchive_filer=  resolved_archive_fileZcached_file_kwargsZhas_file_kwargsr  Zone_state_dictr&  Zinit_contextsr
  r   rp  rq  r  Zload_in_8bit_fp32_cpu_offloadZkeys_on_cpuZsupports_4bitrx  target_dtyperw  Zno_split_modulesZdevice_map_kwargsZdevice_map_without_lm_headtied_paramsrz  r{  r   r   r~  r=  r  rf   )r;  rB  rs  r  rg   r    s    v
 











  






 






          ,




 

 &.














  










   



	




zPreTrainedModel.from_pretrainedc           5         s
  d}|rddl m} |
d k	rd|
 krt|ttfr>|d n|d}|d kr`|s`td|d k	rvtj	|dd	 |d krd}|o|d k	}|
  | }t| }|jd
d  |} fdd|D }tdkr
tfdd|D }tfdd|D }nd}d}| o|}|o(| }|r` dfdd|D fdd|D }n|rxfdd|D }tt|t| }t|t| }dd | D }|rȇfdd|D }n|rfdd|D }t|| }|
  |
d krRt sRtt}|  D ] \
}t|}|| 
 qdd | D } nt|} | D ]||r|fddD n|rfddD fdd|D 	t	dkr^t	tk r^	fdd|D }q^jd k	rjD ]fdd|D }qjd k	r4jD ]fdd|D }q|	rl|D ]*t| kr\nv d t| kr d nJrdddd  t| krdddd  | }!|}"|d k	r|tjkrtfd d|D rtj }"|!j!t!d!kr>|sJt"|d"tj#|!$ d#|"i n||d"tj#|!$ d#|"i q>|r|rfd$d|D }#n|rfd%d|D }#n|}#t%||# |&|j' |d k	r|( D ]2\
}!t
fd&d|D r|!j)*tj |!_)qd'}$|}%tjdkr<t+|js<|r<jd }$tjdkrt+|jr|st,|j}%t|%  tfd(d|D rtd)|
d k	rfd*d+|
 D }
fd,d-}&|d k	rtj-j.|d tj-j.d d. nd |
d k	r|rt/|
||d k	r2t0|1d/d'nd0|d krpt|ttfrX|d n|fd1d+|D }'nfd2d+|d3  D }'fd4d+|' D }(|d k	r|&||||||})t2|%||$}*d }(nbt|ts|g}g }*g })|s|
d k	rd|
 kri nd }(|r&t34 }+i },nd }+d },|rTt5|
|d5}-fd6d|-D }-ng }-t|dkrtt6j7|d7d8}|D ]}.|.|-krqxt8|.}|)|&||||||7 })|	rdt rt9 rt:|%|||$||
||(|+|,||||d9\}/}(},|*|/7 }*nn|%  D ]`\}!|!j!t!d!kr |s@t"|%d"tj#|!$ d#|i n||%d"tj#|!$ d#|i q n|*t2|%||$7 }*~t;<  qx|(d k		rt|(dk	r||%k	rj|s|(D ]8}0t=>tj-||0 d:tj-| d|0 d: qfd;d+|( D }(|	st?|(| d }(|	r4t@|%|,|+ t=A|+ t|*dk	rvd<|*}1d=|1k	r^|1d>7 }1tBd?|jCjD d@|1 |	rdAd |D }dBd |D }t|dk
r|jEjFd k	rg n|jEjF}2|jCjD|2k	rtGjHntGjI}3|3dC| dD|jCjD dE| dF|jCjD dG|jCjD dH ntGIdI|jCjD dJ t|dk
rXtGHdK|jCjD dL| dM| dN n4t|)dk
rtGIdO|jCjD dP| dQ|jCjD dR t|)dk
rdSdTd |)D }4tGHdK|jCjD dL| dU|4 dN ||||)|(|*fS )VNFr   r*  r1  r   r   zThe current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.Tr  c                 S   s,   d| kr|  ddS d| kr(|  ddS | S )Nr  r  r   r   )r   r   rf   rf   rg   _fix_keyx  s
    z8PreTrainedModel._load_pretrained_model.<locals>._fix_keyc                    s   g | ]} |qS rf   rf   r   )r  rf   rg   r|     s     z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>c                 3   s   | ]}|  V  qd S r,  r  ry   sr  rf   rg   r/    s     z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>c                 3   s   | ]}|  V  qd S r,  r  r  r  rf   rg   r/    s     r   c                    s   g | ]}|  s|qS rf   r  r  _prefixrf   rg   r|     s     
 c                    s*   g | ]"}|  r"|t d  n|qS r,  r   r   r  r  rf   rg   r|     s     c                    s   g | ]}d   |gqS r   r   r  r  rf   rg   r|     s     c                 S   s   h | ]\}}|qS rf   rf   )ry   r  r:  rf   rf   rg   r    s     z9PreTrainedModel._load_pretrained_model.<locals>.<setcomp>c                    s*   h | ]"}|  r"|t d  n|qS r,  r  r   r  rf   rg   r    s     c                    s   h | ]}d   |gqS r  r  r   r  rf   rg   r    s     c                 S   s    g | ]\}}t |d kr|qS r  r  )ry   r:  r  rf   rf   rg   r|     s      c                    s*   g | ]"}|  r"|t d  n|qS r,  r  r   r  rf   rg   r|     s     c                    s   g | ]}d   |gqS r  r  r   r  rf   rg   r|     s     c                    s   g | ]}| kr|qS rf   rf   r   )grouprf   rg   r|     s      c                    s   g | ]}| kr|qS rf   rf   r   )missing_in_grouprf   rg   r|     s      c                    s    g | ]}t  |d kr|qS r,  r  r   r  rf   rg   r|     s      c                    s    g | ]}t  |d kr|qS r,  r  r   r  rf   rg   r|     s      c                 3   s   | ]}| kV  qd S r,  rf   r-  r  rf   rg   r/    s     r   r   r   c                    s   g | ]}  d | qS r  rf   r   r  rf   rg   r|     s     c                    s    g | ]}|t  d  d qS )r   Nr  r   r  rf   rg   r|     s     c                 3   s   | ]}| kV  qd S r,  rf   r-  r  rf   rg   r/    s     r   c                 3   s   | ]}|ko| kV  qd S r,  rf   r   )base_model_expected_keysexpected_keys_not_prefixedrf   rg   r/    s     zjThe state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?c                    s&   i | ]\}}|  j d d|qS r   )r   r  rx   r  rf   rg   r    s      z:PreTrainedModel._load_pretrained_model.<locals>.<dictcomp>c           	         s   g }|r|D ]}|| krq|}|r2  d| }n|rNd |ddd  }||kr| | j|| jkr||| | j|| jf | |= q|S )Nr   r   )r   r  rn  r   )	r   model_state_dictr   add_prefix_to_modelremove_prefix_from_modelrJ  r~  Zcheckpoint_keyZ	model_keyr  rf   rg   _find_mismatched_keys   s&    zEPreTrainedModel._load_pretrained_model.<locals>._find_mismatched_keysri   ztorch.r   c                    s   i | ]
}| qS rf   rf   ry   p)r  rf   rg   r  .  s      c                    s    i | ]\}}|t j |qS rf   rc   r   r   ry   r  r   r   rf   rg   r  0  s      r   c                    s*   i | ]"\}} | d kr|||dqS )r1  )Zsafetensors_fileweight_namer   rf   r  )param_device_map	str_dtyperf   rg   r  1  s    )r|  c                    s   g | ]}t j |qS rf   r  )ry   r   r  rf   rg   r|   W  s     zLoading checkpoint shards)desc)	r;  r<  r=  r>  r?  r   r@  rA  rB  z.datc                    s    i | ]\}}  d | |qS r  rf   rr  r  rf   rg   r    s      z
	zsize mismatchz_
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.r   z:
	c                 S   s   g | ]}d |kr|qS r2  rf   ry   elemrf   rf   rg   r|     s      c                 S   s   g | ]}d |kr|qS r  rf   r  rf   rf   rg   r|     s      z(Some weights of the model checkpoint at z! were not used when initializing z: z,
- This IS expected if you are initializing z from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing z from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).z9All model checkpoint weights were used when initializing z.
zSome weights of z3 were not initialized from the model checkpoint at z and are newly initialized: zo
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.zAll the weights of z/ were initialized from the model checkpoint at zf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use z* for predictions without further training.r"  c              	   S   s*   g | ]"\}}}d | d| d| dqS )z- z: found shape z in the checkpoint and z in the model instantiatedrf   )ry   r   Zshape1Zshape2rf   rf   rg   r|     s   z= and are newly initialized because the shapes did not match:
)Jr4  r+  r   r   r   r   r   r   rc   r$  r  r   r   r  r   r5  r   Znamed_buffersrh   r.  r/  r   r   r   rT   _keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedr   r   r  r_   r6  r   r   rZ   emptyr  r   r  r  r  r  r$  r   r  r   r%  expand_device_mapr   r   r  tempfilemkdtempget_disk_only_shard_filesr?   Ztqdmr   rk   rF  r   r   shutilmoverY   rW   rmtreer   r   r   ru  r'  r   r   r  )5r  r   r   r   r  rO  rJ  r|  rX  rY  r;  r<  r[  r   r@  rB  rA  r+  Zis_sharded_safetensorsr  r:  Zoriginal_loaded_keysZhas_prefix_moduleZexpects_prefix_moduler  r  r   r   Zmodel_buffersr;  r  Z	id_tensorr  rC  r  Z_loaded_keysr  r  r  r   r=  r~  r  r>  r?  Zdisk_only_shard_filesr   Znew_error_msgsr  	error_msgZarchsZwarnerZmismatched_warningrf   )r  r  r  r  r  r  r   r  r   r  r  r  r  r  r  rg   r  G  s   


 


2    

$
$
 
&


 


      





.
z&PreTrainedModel._load_pretrained_modelc           	      C   s   dd |D }| dd |D }g }|  D ]p\}}|rf| j d}||r`|t|d  n|}n&|rt|dkrd| j|gn| j}||kr.|| q.|S )Nc                 S   s$   h | ]}d  |d dd qS )r   Nri   )r   r  r   rf   rf   rg   r    s     z>PreTrainedModel.retrieve_modules_from_names.<locals>.<setcomp>c                 S   s<   h | ]4}t |d kr|d  rd|ddd qS )r   ri   r   N)r   r  r   r  r   rf   rf   rg   r    s       r   r   )unionr   r  r   r   r   r   )	r[  r  Z
add_prefixZremove_prefixZmodule_keysZretrieved_modulesr  ru   r  rf   rf   rg   retrieve_modules_from_names  s     "z+PreTrainedModel.retrieve_modules_from_namesc                 C   s&   t | || t|}t| |||}|S )a~  
        This is an experimental function that loads the model using ~1.x model size CPU memory

        Before you call it do:

        1. save which state_dict keys are available
        2. drop state_dict before model is created, since the latter takes 1x model size memory

        Here then we continue:

        3. switch to the meta device all params/buffers that are going to be replaced from the loaded state_dict
        4. load state_dict 2nd time
        5. replace the params/buffers from the state_dict

        Currently, it doesn't handle missing_keys, unexpected_keys, mismatched_keys. It can't handle deepspeed.
        )r)  r   rF  )r   r&  r  r  r   r  rf   rf   rg   _load_pretrained_model_low_mem  s    z.PreTrainedModel._load_pretrained_model_low_mem	AutoModelc                 C   sD   t |ts|j}ddlm  m} t||s:t| d|| _dS )a  
        Register this class with a given auto class. This should only be used for custom models as the ones in the
        library are already mapped with an auto class.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`):
                The auto class to register this new model with.
        r   Nz is not a valid auto class.)	r   r   r   Ztransformers.models.automodelsrg  r   r   r(  )r  Z
auto_classZauto_modulerf   rf   rg   register_for_auto_class  s    

z'PreTrainedModel.register_for_auto_classc                 C   sT   t  stdddlm} t|tdk r>td| dddlm} || S )a(  
        Converts the model to use [PyTorch's native attention
        implementation](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html), integrated to
        Transformers through [Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). Only a
        subset of all Transformers models are supported.

        PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested
        tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog
        post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).

        Returns:
            [`PreTrainedModel`]: The model converted to BetterTransformer.
        <The package `optimum` is required to use Better Transformer.r   rl   1.7.0EPlease install optimum>=1.7.0 to use Better Transformer. The version  was found.BetterTransformer)	r:   rK  optimum.versionrm   r   r  optimum.bettertransformerr  Z	transformr[  Zoptimum_versionr  rf   rf   rg   to_bettertransformer&  s    
z$PreTrainedModel.to_bettertransformerc                 C   sT   t  stdddlm} t|tdk r>td| dddlm} || S )a  
        Reverts the transformation from [`~PreTrainedModel.to_bettertransformer`] so that the original modeling is
        used, for example in order to save the model.

        Returns:
            [`PreTrainedModel`]: The model converted back to the original modeling.
        r  r   rl   r  r  r  r  )	r:   rK  r  rm   r   r  r  r  reverser  rf   rf   rg   reverse_bettertransformerB  s    
z)PreTrainedModel.reverse_bettertransformerc              
   C   s   t |stj st rdS |dk	s0| jjdkr4dS | jj|ddddgf krd}| jjdk	rp| jj| jjks| jjdk	r| jj| jjks| jj	dk	r| jj	| jjkr|d| jj d| jj d| jj d| jj	 d		7 }t
| dS )
zv
        Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given.
        Nri   r   zWe strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.z5
You may ignore this warning if your `pad_token_id` (z&) is identical to the `bos_token_id` (z), `eos_token_id` (z), or the `sep_token_id` (z ), and your input is not padded.)rF   r_   Zjit
is_tracingrG   ru  Zpad_token_idZbos_token_idZeos_token_idZsep_token_idr   r3  )r[  r  rq  Zwarn_stringrf   rf   rg   %warn_if_padding_and_no_attention_maskX  s.    	


,z5PreTrainedModel.warn_if_padding_and_no_attention_mask)NN)NN)N)NN)NF)T)
FNTFNNNNFN)FF)r   )r  )Zr   r  r  r  r  r  r  r(  r  r  r  r  r  r-  r0  Zis_parallelizabler  r  r  r   r   r_   r   r  r   r   r  r  r  classmethodr  r   r  r   r   r  r   r  r
   r   rj   r  r  r  r  r  r  ro   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r	   r  r  r  r  saverc   PathLiker  r   r)  rB  r   r  r$  rF  r   r  r  r  r  r  r  r  r  __classcell__rf   rf   r  rg   r  -  sH  
     M		J    &
   h     S
  




         6             
r  r   r  z
model file)objectZobject_classZobject_filesc                       sD   e Zd ZdZed fddZd	ejeej ejdddZ	  Z
S )
PoolerStartLogitsz
    Compute SQuAD start logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    r  c                    s   t    t|jd| _d S )Nr   )r  r  r   r  hidden_sizedenser[  ru  r  rf   rg   r    s    
zPoolerStartLogits.__init__N)hidden_statesp_maskrv   c                 C   sT   |  |d}|dk	rPt| tjkr<|d|  d|  }n|d|  d|  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        Returns:
            `torch.FloatTensor`: The start logits for SQuAD.
        ri   Nr     ꌠ9Y>)F)r  squeezer   r_   r6  )r[  r  r  xrf   rf   rg   forward  s    zPoolerStartLogits.forward)N)r   r  r  r  r   r  r_   FloatTensorr
   r  r  rf   rf   r  rg   r    s     r  c                       sT   e Zd ZdZed fddZd	ejeej eej	 eej ejdddZ
  ZS )
PoolerEndLogitsz
    Compute SQuAD end logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    r  c                    sR   t    t|jd |j| _t | _tj|j|j	d| _t|jd| _
d S )Nra  )Zepsr   )r  r  r   r  r  dense_0Tanh
activation	LayerNormZlayer_norm_epsdense_1r  r  rf   rg   r    s
    

zPoolerEndLogits.__init__N)r  start_statesstart_positionsr  rv   c                 C   s   |dk	s|dk	st d|dk	rh|jdd \}}|ddddf dd|}|d|}|d|d}| tj||gdd}| |}| |}| 	|
d}|dk	rt| tjkr|d|  d|  }n|d|  d|  }|S )	a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The end logits for SQuAD.
        N7One of start_states, start_positions should be not Noner  ri   rd  r   r  r  )r  rn  r~  gatherr  r_   ro  r  r  r  r  r   r6  )r[  r  r  r  r  slenhszr  rf   rf   rg   r    s(    

zPoolerEndLogits.forward)NNNr   r  r  r  r   r  r_   r  r
   
LongTensorr  r  rf   rf   r  rg   r    s   	
   r  c                       sN   e Zd ZdZ fddZdejeej eej eej ejdddZ	  Z
S )	PoolerAnswerClassz
    Compute SQuAD 2.0 answer class from classification and start tokens hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    c                    sB   t    t|jd |j| _t | _tj|jddd| _d S )Nra  r   F)r  )	r  r  r   r  r  r  r  r  r  r  r  rf   rg   r    s    

zPoolerAnswerClass.__init__N)r  r  r  	cls_indexrv   c                 C   s   |j d }|dk	s"|dk	s"td|dk	rX|ddddf dd|}|d|d}|dk	r|ddddf dd|}|d|d}n|dddddf }| tj||gdd}| |}| 	|d}|S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The SQuAD 2.0 answer class.
        ri   Nr  r  r  )
rn  r  r~  r  r  r  r_   ro  r  r  )r[  r  r  r  r  r  Zcls_token_stater  rf   rf   rg   r    s$    

zPoolerAnswerClass.forward)NNN)r   r  r  r  r  r_   r  r
   r  r  r  rf   rf   r  rg   r    s   	   r  c                   @   s~   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeej ed< dS )	SquadHeadOutputa  
    Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.

    Nlossstart_top_log_probsstart_top_indexend_top_log_probsend_top_index
cls_logits)r   r  r  r  r  r
   r_   r  __annotations__r  r  r  r  r  r  rf   rf   rf   rg   r  /  s   
r  c                       sx   e Zd ZdZ fddZeeedd
ej	e
ej e
ej e
ej e
ej e
ej	 eeeeej	 f ddd	Z  ZS )	SQuADHeadz
    A SQuAD head inspired by XLNet.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    c                    s<   t    |j| _|j| _t|| _t|| _t|| _	d S r,  )
r  r  start_n_top	end_n_topr  start_logitsr  
end_logitsr  answer_classr  r  rf   rg   r  X  s    


zSQuADHead.__init__)output_typer  NF)r  r  end_positionsr  is_impossibler  return_dictrv   c                 C   s  | j ||d}|dk	r|dk	r||||fD ]"}	|	dk	r*|	 dkr*|	d q*| j|||d}
t }|||}||
|}|| d }|dk	r|dk	r| j|||d}t }|||}||d 7 }|rt|d	S |fS |	 \}}}tj
j|dd
}tj|| jdd
\}}|ddd|}t|d|}|dd|dd}|d|}|dk	rf|dnd}| j|||d}
tj
j|
dd
}tj|| jdd
\}}|d| j| j }|d| j| j }td||}| j|||d}|s|||||fS t|||||dS dS )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                Final hidden states of the model on the sequence tokens.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Positions of the first token for the labeled span.
            end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Positions of the last token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
            is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Whether the question has a possible answer in the paragraph or not.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.
            return_dict (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:
        )r  Nr   ri   )r  r  ra  )r  r  g      ?)r  r  r  )r  r  z
blh,bl->bh)r  r  )r  r  r  r  r  )r  rd  Zsqueeze_r   r   r  r   ZBCEWithLogitsLossr  r  r  Zsoftmaxr_   Ztopkr  r{  r~  r  Z	expand_asr  viewZeinsum)r[  r  r  r  r  r  r  r  r  r  r   Zloss_fctZ
start_lossZend_lossZ
total_lossr  Zloss_fct_clsZcls_lossZbszr  r  Zstart_log_probsr  r  Zstart_top_index_expr  Zhidden_states_expandedZend_log_probsr  r  rf   rf   rg   r  a  sd    


  

  
zSQuADHead.forward)NNNNNF)r   r  r  r  r  r@   r  r   r_   r  r
   r  r   r   r   r  r  rf   rf   r  rg   r  N  s&   		
      r  c                       sD   e Zd ZdZed fddZd	ejeej	 ejdddZ
  ZS )
SequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r  c                    s   t    t|dd| _| jdkr&tt | _t|drv|jrvt|dr`|j	r`|j
dkr`|j
}n|j}t|j|| _t|dd }|rt|nt | _t | _t|dr|jdkrt|j| _t | _t|d	r|jdkrt|j| _d S )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   Zsummary_activationsummary_first_dropoutsummary_last_dropout)r  r  r  r  r   r   summaryr   r  r  Z
num_labelsr  r   r  r   r  first_dropoutr  ZDropoutlast_dropoutr  )r[  ru  Znum_classesZactivation_stringr  rf   rg   r    s$    

zSequenceSummary.__init__N)r  r  rv   c                 C   s  | j dkr|dddf }n| j dkr8|dddf }n| j dkrP|jdd}n| j d	kr|dkrtj|d
ddddf |jd d tjd}n2|dd}|d| d  |	df }|
d|d}n| j dkrt| |}| |}| |}| |}|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r	  Nri   firstr   meanr   r  r  .r  rb  )ri   r
  )r  r  r_   Z	full_likern  longr{  r~  rd  r  r  r  r   r  r  r  r  )r[  r  r  r  rf   rf   rg   r    s.    



"




zSequenceSummary.forward)Nr  rf   rf   r  rg   r    s     r  )r   rv   c                 C   s   t | drt| jS | S dS )z
    Recursively unwraps a model from potential containers (as used in distributed training).

    Args:
        model (`torch.nn.Module`): The model to unwrap.
    ru   N)r   r&  ru   )r   rf   rf   rg   r&  &  s    

r&  c                    s4   i }|   D ]"\ | fdd|D  q|S )zT
    Expand a device map to return the correspondance parameter name to device.
    c                    s*   i | ]"}|ks |  d r| qS r  r  r  r   ru   rf   rg   r  :  s
        z%expand_device_map.<locals>.<dictcomp>)r   r  )r;  Zparam_namesZnew_device_maprf   r  rg   r  4  s    r  c                 C   sr   t t}|d  D ]H\}}t|dkrL|| krLd|ddd }q|| | |  qdd | D S )zT
    Returns the list of shard files containing only weights offloaded to disk.
    r   r   r   Nri   c                 S   s"   g | ]\}}t |d hkr|qS )r1  )r   )ry   fnameZdevicesrf   rf   rg   r|   H  s      z-get_disk_only_shard_files.<locals>.<listcomp>)r.  r/  r   r   r   r   r  r   )r;  r|  Zfiles_contentr  r=  rf   rf   rg   r  >  s    
r  )T)TT)	NNNNNNFFN)N)r.  r   importlib.metadatar  r7  r   rc   r   r  r  rj  
contextlibr   dataclassesr   	functoolsr   r   typingr   r   r   r	   r
   r   r   r_   	packagingr   r   r   Ztorch.nnr   r   Zactivationsr   Zconfiguration_utilsr   Zdynamic_module_utilsr   Z
generationr   r   r4  r   r   r   Zpytorch_utilsr   r   r   r   r   r    r!   utilsr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   Z	utils.hubrB   rC   Zutils.import_utilsrD   rE   rF   rG   Zutils.quantization_configrH   rI   rJ   Zutils.versionsrK   rd   re   upperrL   rN   rv  rO   rP   rQ   Zaccelerate.hooksrR   r  rS   rT   rU   rV   rW   rX   rY   rZ   Zsafetensorsr[   Zsafetensors.torchr\   r   r]   r7  Z
get_loggerr   r   ro   rh   rk   Z!smdistributed.modelparallel.torchZmodelparallelr+  Zsmdistributed.modelparallelrm   ZSMP_VERSIONr  r*  rn   rq   r   r   r   r   r   r   r   r   rj   r   r   r  r   r   r  r"  r)  rF  rH  rr   r  r  r  r   r  r  r  r  r  r  r&  r  r  rf   rf   rf   rg   <module>   s   $$	"(
1  
 S
T2=          
 	  2                      e  
&EAuc
