# `accelerate/utils/modeling.py`, recovered from a decompiled `.pyc`. The docstrings
# below are original; the imports and function bodies were reconstructed from them and
# from the upstream `accelerate` sources, so treat the implementations as a close
# sketch of the shipped code rather than a byte-exact copy.

import contextlib
import gc
import inspect
import json
import logging
import os
import re
import shutil
import tempfile
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn

from ..state import AcceleratorState
from .constants import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
from .dataclasses import AutocastKwargs, CustomDtype, DistributedType
from .imports import is_mps_available, is_npu_available, is_safetensors_available, is_xpu_available
from .offload import load_offloaded_weight, offload_weight, save_offload_index
from .tqdm import is_tqdm_available, tqdm


if is_npu_available(check_device=False):
    import torch_npu  # noqa: F401

if is_safetensors_available():
    from safetensors import safe_open
    from safetensors.torch import load_file as safe_load_file

WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"

logger = logging.getLogger(__name__)


def convert_file_size_to_int(size: Union[int, str]):
    """
    Converts a size expressed as a string with digits and unit (like `"5MB"`) to an integer (in bytes).

    Args:
        size (`int` or `str`): The size to convert. Will be directly returned if an `int`.

    Example:

    ```py
    >>> convert_file_size_to_int("1MiB")
    1048576
    ```
    """
    mem_size = -1
    err_msg = (
        f"`size` {size} is not in a valid format. Use an integer for bytes, or a string with a unit (like '5.0GB')."
    )
    try:
        if isinstance(size, int):
            mem_size = size
        elif size.upper().endswith("GIB"):
            mem_size = int(float(size[:-3]) * (2**30))
        elif size.upper().endswith("MIB"):
            mem_size = int(float(size[:-3]) * (2**20))
        elif size.upper().endswith("KIB"):
            mem_size = int(float(size[:-3]) * (2**10))
        elif size.upper().endswith("GB"):
            int_size = int(float(size[:-2]) * (10**9))
            mem_size = int_size // 8 if size.endswith("b") else int_size
        elif size.upper().endswith("MB"):
            int_size = int(float(size[:-2]) * (10**6))
            mem_size = int_size // 8 if size.endswith("b") else int_size
        elif size.upper().endswith("KB"):
            int_size = int(float(size[:-2]) * (10**3))
            mem_size = int_size // 8 if size.endswith("b") else int_size
    except ValueError:
        raise ValueError(err_msg)

    if mem_size < 0:
        raise ValueError(err_msg)
    return mem_size


def dtype_byte_size(dtype: torch.dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.

    Example:

    ```py
    >>> dtype_byte_size(torch.float32)
    4
    ```
    """
    if dtype == torch.bool:
        return 1 / 8
    elif dtype == CustomDtype.INT4:
        return 1 / 2
    elif dtype == CustomDtype.FP8:
        return 1
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8


def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]:
    """
    Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For
    example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is
    guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with
    non-overlapping lifetimes may have the same id.
    """
    _SIZE = {
        torch.int64: 8,
        torch.float32: 4,
        torch.int32: 4,
        torch.bfloat16: 2,
        torch.float16: 2,
        torch.int16: 2,
        torch.uint8: 1,
        torch.int8: 1,
        torch.bool: 1,
        torch.float64: 8,
    }
    try:
        storage_ptr = tensor.untyped_storage().data_ptr()
        storage_size = tensor.untyped_storage().nbytes()
    except Exception:
        # Fallback for torch versions without `untyped_storage`.
        try:
            storage_ptr = tensor.storage().data_ptr()
            storage_size = tensor.storage().size() * _SIZE[tensor.dtype]
        except NotImplementedError:
            # Fallback for meta storage.
            storage_ptr = 0
            storage_size = tensor.nelement() * _SIZE[tensor.dtype]

    return tensor.device, storage_ptr, storage_size
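

# Illustrative example (behavior sketched from the docstring above): two views over
# the same underlying storage share an identifier, an independent tensor does not.
#
#     >>> base = torch.zeros(8)
#     >>> id_tensor_storage(base) == id_tensor_storage(base[:2])
#     True
#     >>> id_tensor_storage(base) == id_tensor_storage(torch.zeros(8))
#     False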


def shard_checkpoint(
    state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
):
    """
    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
    given size.

    The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
    optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
    limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
    [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].

    <Tip warning={true}>

    If one of the model's weights is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
    have a size greater than `max_shard_size`.

    </Tip>

    Args:
        state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
            The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
            (like `"5MB"`).
        weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
            The name of the model save file.
    """
    max_shard_size = convert_file_size_to_int(max_shard_size)

    sharded_state_dicts = [{}]
    last_block_size = 0
    total_size = 0
    storage_id_to_block = {}

    for key, weight in state_dict.items():
        # When bitsandbytes serialization is used, weights in the state dict can be strings.
        if isinstance(weight, str):
            continue
        else:
            storage_id = id_tensor_storage(weight)

        # If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block`.
        if storage_id in storage_id_to_block:
            block_id = storage_id_to_block[storage_id]
            sharded_state_dicts[block_id][key] = weight
            continue

        weight_size = weight.numel() * dtype_byte_size(weight.dtype)

        # If this weight is going to tip up over the maximal size, we split.
        if last_block_size + weight_size > max_shard_size:
            sharded_state_dicts.append({})
            last_block_size = 0

        sharded_state_dicts[-1][key] = weight
        last_block_size += weight_size
        total_size += weight_size
        storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1

    # If we only have one shard, we return it.
    if len(sharded_state_dicts) == 1:
        return {weights_name: sharded_state_dicts[0]}, None

    # Otherwise, let's build the index.
    weight_map = {}
    shards = {}
    for idx, shard in enumerate(sharded_state_dicts):
        shard_file = weights_name.replace(".bin", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.bin")
        shard_file = shard_file.replace(
            ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
        )
        shards[shard_file] = shard
        for key in shard.keys():
            weight_map[key] = shard_file

    # Add the metadata.
    metadata = {"total_size": total_size}
    index = {"metadata": metadata, "weight_map": weight_map}
    return shards, index
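

# Illustrative sketch (toy byte counts, assuming the default fp32 parameters and
# `weights_name="pytorch_model.bin"`): two `Linear(4, 4)` layers weigh 80 bytes each,
# so a 100-byte budget forces one shard per layer.
#
#     >>> model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
#     >>> shards, index = shard_checkpoint(model.state_dict(), max_shard_size=100)
#     >>> sorted(shards)
#     ['pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin']
#     >>> index["metadata"]["total_size"]
#     160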


def set_module_tensor_to_device(
    module: nn.Module,
    tensor_name: str,
    device: Union[int, str, torch.device],
    value: Optional[torch.Tensor] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    fp16_statistics: Optional[torch.HalfTensor] = None,
):
    """
    A helper function to set a given tensor (parameter or buffer) of a module on a specific device (note that doing
    `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function).

    Args:
        module (`torch.nn.Module`):
            The module in which the tensor we want to move lives.
        tensor_name (`str`):
            The full name of the parameter/buffer.
        device (`int`, `str` or `torch.device`):
            The device on which to set the tensor.
        value (`torch.Tensor`, *optional*):
            The value of the tensor (useful when going from the meta device to any other device).
        dtype (`torch.dtype`, *optional*):
            If passed along, the value of the parameter will be cast to this `dtype`. Otherwise, `value` will be cast
            to the dtype of the existing parameter in the model.
        fp16_statistics (`torch.HalfTensor`, *optional*):
            The list of fp16 statistics to set on the module, used for 8 bit model serialization.
    """
    # Recurse if needed.
    if "." in tensor_name:
        splits = tensor_name.split(".")
        for split in splits[:-1]:
            new_module = getattr(module, split)
            if new_module is None:
                raise ValueError(f"{module} has no attribute {split}.")
            module = new_module
        tensor_name = splits[-1]

    if tensor_name not in module._parameters and tensor_name not in module._buffers:
        raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
    is_buffer = tensor_name in module._buffers
    old_value = getattr(module, tensor_name)

    if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
        raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put it on {device}.")

    if value is not None:
        if old_value.shape != value.shape:
            raise ValueError(
                f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape '
                f"{old_value.shape}), this looks incorrect."
            )

        if dtype is None:
            # For compatibility with PyTorch load_state_dict, which converts state dict dtypes to the existing dtypes
            # in the model.
            value = value.to(old_value.dtype)
        elif not str(value.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
            value = value.to(dtype)

    param = module._parameters[tensor_name] if tensor_name in module._parameters else None
    param_cls = type(param)

    device_quantization = None
    with torch.no_grad():
        # Quantized parameters (bitsandbytes `Int8Params`/`FP4Params`) have to be materialized on CPU first before
        # being moved to a CUDA device.
        if (
            param is not None
            and param.device.type != "cuda"
            and torch.device(device).type == "cuda"
            and param_cls.__name__ in ["Int8Params", "FP4Params"]
        ):
            device_quantization = device
            device = "cpu"
        if value is None:
            new_value = old_value.to(device)
            if dtype is not None and device in ["meta", torch.device("meta")]:
                if not str(old_value.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
                    new_value = new_value.to(dtype)

                if not is_buffer:
                    module._parameters[tensor_name] = param_cls(new_value, requires_grad=old_value.requires_grad)
        elif isinstance(value, torch.Tensor):
            new_value = value.to(device)
        else:
            new_value = torch.tensor(value, device=device)
        if device_quantization is not None:
            device = device_quantization
        if is_buffer:
            module._buffers[tensor_name] = new_value
        elif value is not None or torch.device(device) != module._parameters[tensor_name].device:
            param_cls = type(module._parameters[tensor_name])
            kwargs = module._parameters[tensor_name].__dict__
            if param_cls.__name__ in ["Int8Params", "FP4Params"]:
                if param_cls.__name__ == "Int8Params" and new_value.dtype == torch.float32:
                    # Downcast to fp16 if needed for 8 bit serialization.
                    new_value = new_value.to(torch.float16)
                # Quantize modules that are going to stay on the CPU so that we offload quantized weights.
                if device == "cpu" and param_cls.__name__ == "Int8Params":
                    new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(0).to("cpu")
                    new_value.CB = new_value.CB.to("cpu")
                    new_value.SCB = new_value.SCB.to("cpu")
                else:
                    new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(device)
            else:
                new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
            module._parameters[tensor_name] = new_value

            if fp16_statistics is not None:
                setattr(module._parameters[tensor_name], "SCB", fp16_statistics.to(device))
                del fp16_statistics
            # A weight that went through the meta device loses its `SCB`/`quant_state` attribute, so quantize it back
            # if the module is a bitsandbytes linear layer with an actual (non-meta) weight.
            if (
                module.__class__.__name__ == "Linear8bitLt"
                and getattr(module.weight, "SCB", None) is None
                and str(module.weight.device) != "meta"
            ):
                device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
                if not getattr(module.weight, "SCB", None) and device_index is not None:
                    if module.bias is not None and module.bias.device.type != "meta":
                        # If a bias exists, we need to wait until it is set on the correct device to quantize.
                        module = module.cuda(device_index)
                    elif module.bias is None:
                        # If no bias exists, we can quantize right away.
                        module = module.cuda(device_index)
            elif module.__class__.__name__ == "Linear4bit" and getattr(module.weight, "quant_state", None) is None:
                device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
                if not getattr(module.weight, "quant_state", None) and device_index is not None:
                    module.weight = module.weight.cuda(device_index)
    torch.cuda.empty_cache()
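

# Illustrative sketch (requires a torch version with meta-device factory kwargs):
# materialize a meta parameter on a concrete device.
#
#     >>> linear = nn.Linear(2, 2, device="meta")
#     >>> set_module_tensor_to_device(linear, "weight", "cpu", value=torch.zeros(2, 2))
#     >>> linear.weight.device
#     device(type='cpu')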


def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurse: bool = False):
    """
    A helper function that gathers all the tensors (parameters + buffers) of a given module. If `include_buffers=True`
    it's the same as doing `module.named_parameters(recurse=recurse) + module.named_buffers(recurse=recurse)`.

    Args:
        module (`torch.nn.Module`):
            The module we want the tensors on.
        include_buffers (`bool`, *optional*, defaults to `True`):
            Whether or not to include the buffers in the result.
        recurse (`bool`, *optional*, defaults to `False`):
            Whether or not to go look in every submodule or just return the direct parameters and buffers.
    """
    for named_parameter in module.named_parameters(recurse=recurse):
        yield named_parameter

    if include_buffers:
        for named_buffer in module.named_buffers(recurse=recurse):
            yield named_buffer


class FindTiedParametersResult(list):
    """
    This is a subclass of a list to handle backward compatibility for Transformers. Do not rely on the fact this is not
    a list or on the `values` method as in the future this will be removed.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def values(self):
        return sum([x[1:] for x in self], [])


def check_tied_parameters_in_config(model: nn.Module):
    """
    Check if there is any indication in the given model that some weights should be tied.

    Args:
        model (`torch.nn.Module`): The model to inspect

    Returns:
        bool: True if the model needs to have tied weights
    """

    # Based on model.tie_weights() method.
    has_tied_word_embedding = False
    has_tied_encoder_decoder = False
    has_tied_module = False

    if "PreTrainedModel" in [c.__name__ for c in inspect.getmro(model.__class__)]:
        has_tied_word_embedding = (
            hasattr(model, "config")
            and getattr(model.config, "tie_word_embeddings", False)
            and model.get_output_embeddings()
        )
        has_tied_encoder_decoder = (
            hasattr(model, "config")
            and getattr(model.config, "is_encoder_decoder", False)
            and getattr(model.config, "tie_encoder_decoder", False)
        )
        has_tied_module = any(hasattr(module, "_tie_weights") for module in model.modules())

    return any([has_tied_word_embedding, has_tied_encoder_decoder, has_tied_module])


def _get_param_device(param, device_map):
    if param in device_map:
        return device_map[param]
    parent_param = ".".join(param.split(".")[:-1])
    if parent_param == param:
        raise ValueError(f"The `device_map` does not contain the module {param}.")
    else:
        return _get_param_device(parent_param, device_map)
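

# Illustrative example: a parameter inherits its device from the closest parent entry
# in the device map.
#
#     >>> _get_param_device("encoder.layer.0.weight", {"encoder": 0, "decoder": 1})
#     0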


def check_tied_parameters_on_same_device(tied_params, device_map):
    """
    Check if tied parameters are on the same device.

    Args:
        tied_params (`List[List[str]]`):
            A list of lists of parameter names being all tied together.

        device_map (`Dict[str, Union[int, str, torch.device]]`):
            A map that specifies where each submodule should go.
    """
    for tie_param in tied_params:
        tie_param_devices = {}
        for param in tie_param:
            tie_param_devices[param] = _get_param_device(param, device_map)
        if len(set(tie_param_devices.values())) > 1:
            logger.warn(
                f"Tied parameters are on different devices: {tie_param_devices}. "
                "Please modify your custom device map or set `device_map='auto'`. "
            )


def find_tied_parameters(model: nn.Module, **kwargs):
    """
    Find the tied parameters in a given model.

    <Tip warning={true}>

    The signature accepts keyword arguments, but they are for the recursive part of this function and you should ignore
    them.

    </Tip>

    Args:
        model (`torch.nn.Module`): The model to inspect.

    Returns:
        List[List[str]]: A list of lists of parameter names being all tied together.

    Example:

    ```py
    >>> from collections import OrderedDict
    >>> import torch.nn as nn

    >>> model = nn.Sequential(OrderedDict([("linear1", nn.Linear(4, 4)), ("linear2", nn.Linear(4, 4))]))
    >>> model.linear2.weight = model.linear1.weight
    >>> find_tied_parameters(model)
    [['linear1.weight', 'linear2.weight']]
    ```
    """
    # Initialize result and named_parameters before recursing.
    named_parameters = kwargs.get("named_parameters", None)
    prefix = kwargs.get("prefix", "")
    result = kwargs.get("result", {})

    if named_parameters is None:
        named_parameters = {n: p for n, p in model.named_parameters()}
    else:
        # A tied parameter will not be in the full `named_parameters` seen above, but it will be in the
        # `named_parameters` of the submodule it belongs to: the ones missing from the full dict are exactly the tied
        # parameters.
        for name, parameter in model.named_parameters():
            full_name = name if prefix == "" else f"{prefix}.{name}"
            if full_name not in named_parameters:
                # When we find one, it has to be one of the existing parameters.
                for new_name, new_param in named_parameters.items():
                    if new_param is parameter:
                        if new_name not in result:
                            result[new_name] = []
                        result[new_name].append(full_name)

    # Once we have treated direct parameters, we move to the child modules.
    for name, child in model.named_children():
        child_name = name if prefix == "" else f"{prefix}.{name}"
        find_tied_parameters(child, named_parameters=named_parameters, prefix=child_name, result=result)

    return FindTiedParametersResult([sorted([weight] + list(set(tied))) for weight, tied in result.items()])


def retie_parameters(model, tied_params):
    """
    Reties tied parameters in a given model if the link was broken (for instance when adding hooks).

    Args:
        model (`torch.nn.Module`):
            The model in which to retie parameters.
        tied_params (`List[List[str]]`):
            A list of lists of parameter names being all tied together, as obtained by `find_tied_parameters`.
    """
    for tied_group in tied_params:
        param_to_tie = None
        # The first iteration of the loop sets param_to_tie, the next ones tie their parameter to it.
        for param_name in tied_group:
            module = model
            splits = param_name.split(".")
            for split in splits[:-1]:
                module = getattr(module, split)
            if param_to_tie is None:
                param_to_tie = getattr(module, splits[-1])
            else:
                setattr(module, splits[-1], param_to_tie)
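

# Illustrative sketch: re-link two parameters that are meant to be tied.
#
#     >>> model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
#     >>> retie_parameters(model, [["0.weight", "1.weight"]])
#     >>> model[0].weight is model[1].weight
#     True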


def _get_proper_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """
    Just does torch.dtype(dtype) if necessary.
    """
    if isinstance(dtype, str):
        # We accept "torch.float16" or just "float16".
        dtype = dtype.replace("torch.", "")
        dtype = getattr(torch, dtype)
    return dtype


def compute_module_sizes(
    model: nn.Module,
    dtype: Optional[Union[str, torch.dtype]] = None,
    special_dtypes: Optional[Dict[str, Union[str, torch.dtype]]] = None,
):
    """
    Compute the size of each submodule of a given model.
    """
    if dtype is not None:
        dtype = _get_proper_dtype(dtype)
        dtype_size = dtype_byte_size(dtype)
    if special_dtypes is not None:
        special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
        special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}
    module_sizes = defaultdict(int)
    for name, tensor in named_module_tensors(model, recurse=True):
        if special_dtypes is not None and name in special_dtypes:
            size = tensor.numel() * special_dtypes_size[name]
        elif dtype is None:
            size = tensor.numel() * dtype_byte_size(tensor.dtype)
        else:
            size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype))
        name_parts = name.split(".")
        for idx in range(len(name_parts) + 1):
            module_sizes[".".join(name_parts[:idx])] += size

    return module_sizes


def get_max_layer_size(
    modules: List[Tuple[str, torch.nn.Module]], module_sizes: Dict[str, int], no_split_module_classes: List[str]
):
    """
    Utility function that will scan a list of named modules and return the maximum size used by one full layer. The
    definition of a layer being:
    - a module with no direct children (just parameters and buffers)
    - a module whose class name is in the list `no_split_module_classes`

    Args:
        modules (`List[Tuple[str, torch.nn.Module]]`):
            The list of named modules where we want to determine the maximum layer size.
        module_sizes (`Dict[str, int]`):
            A dictionary mapping each layer name to its size (as generated by `compute_module_sizes`).
        no_split_module_classes (`List[str]`):
            A list of class names for layers we don't want to be split.

    Returns:
        `Tuple[int, List[str]]`: The maximum size of a layer with the list of layer names realizing that maximum size.
    """
    max_size = 0
    layer_names = []
    modules_to_treat = modules.copy()
    while len(modules_to_treat) > 0:
        module_name, module = modules_to_treat.pop(0)
        modules_children = list(module.named_children()) if isinstance(module, torch.nn.Module) else []
        if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
            # No splitting this one, so we compare to the current max size.
            size = module_sizes[module_name]
            if size > max_size:
                max_size = size
                layer_names = [module_name]
            elif size == max_size:
                layer_names.append(module_name)
        else:
            modules_to_treat = [(f"{module_name}.{n}", v) for n, v in modules_children] + modules_to_treat
    return max_size, layer_names


def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None):
    """
    Get the maximum memory available if nothing is passed, converts string to int otherwise.
    """
    import psutil

    if max_memory is None:
        if not (torch.cuda.is_available() or is_xpu_available()):
            max_memory = {}
        else:
            # Make sure CUDA/XPU is initialized on each GPU to have the right memory info.
            if not is_xpu_available():
                for i in range(torch.cuda.device_count()):
                    _ = torch.tensor([0], device=i)
                max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())}
            else:
                for i in range(torch.xpu.device_count()):
                    _ = torch.tensor(0, device=torch.device("xpu", i))
                max_memory = {i: torch.xpu.max_memory_allocated(i) for i in range(torch.xpu.device_count())}
        # Allocate everything on the mps device, as the RAM is shared.
        if is_mps_available():
            max_memory["mps"] = psutil.virtual_memory().available
        else:
            max_memory["cpu"] = psutil.virtual_memory().available
        return max_memory

    for key in max_memory:
        if isinstance(max_memory[key], str):
            max_memory[key] = convert_file_size_to_int(max_memory[key])

    # Need to sort the devices by type to make sure that we allocate the GPUs first (they are represented by ints).
    gpu_devices = [k for k in max_memory.keys() if isinstance(k, int)]
    gpu_devices.sort()
    # Check that the GPU/XPU devices are available and if not, throw a warning.
    num_devices = torch.xpu.device_count() if is_xpu_available() else torch.cuda.device_count()
    for device in gpu_devices:
        if device >= num_devices or device < 0:
            logger.warning(f"Device {device} is not available, available devices are {list(range(num_devices))}")
    # Add the other devices in the preset order if they are available.
    all_devices = gpu_devices + [k for k in ["mps", "cpu", "disk"] if k in max_memory.keys()]
    # Raise an error if a device is not recognized.
    for k in max_memory.keys():
        if k not in all_devices:
            raise ValueError(
                f"Device {k} is not recognized, available devices are integers(for GPU/XPU), 'mps', 'cpu' and 'disk'"
            )
    max_memory = {k: max_memory[k] for k in all_devices}

    return max_memory


def clean_device_map(device_map: Dict[str, Union[int, str, torch.device]], module_name: str = ""):
    """
    Cleans a device_map by grouping all submodules that go on the same device together.
    """
    # Get the value of the current module and if there is only one split across several keys, regroup it.
    prefix = "" if module_name == "" else f"{module_name}."
    values = [v for k, v in device_map.items() if k.startswith(prefix)]
    if len(set(values)) == 1 and len(values) > 1:
        for k in [k for k in device_map if k.startswith(prefix)]:
            del device_map[k]
        device_map[module_name] = values[0]

    # Recurse over the children.
    children_modules = [k for k in device_map.keys() if k.startswith(prefix) and len(k) > len(module_name)]
    idx = len(module_name.split(".")) + 1 if len(module_name) > 0 else 1
    children_modules = set(".".join(k.split(".")[:idx]) for k in children_modules)
    for child in children_modules:
        clean_device_map(device_map, module_name=child)

    return device_map
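

# Illustrative example: submodules sharing a device are regrouped under their common
# parent.
#
#     >>> clean_device_map({"block.linear1": 0, "block.linear2": 0, "head": "cpu"})
#     {'head': 'cpu', 'block': 0}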


def load_offloaded_weights(model, index, offload_folder):
    """
    Loads the weights from the offload folder into the model.

    Args:
        model (`torch.nn.Module`):
            The model to load the weights into.
        index (`dict`):
            A dictionary containing the parameter name and its metadata for each parameter that was offloaded from the
            model.
        offload_folder (`str`):
            The folder where the offloaded weights are stored.
    """
    if index is None or len(index) == 0:
        # Nothing to do.
        return
    for param_name, metadata in index.items():
        if "SCB" in param_name:
            continue
        fp16_statistics = None
        if "weight" in param_name and param_name.replace("weight", "SCB") in index.keys():
            weight_name = param_name.replace("weight", "SCB")
            fp16_statistics = load_offloaded_weight(
                os.path.join(offload_folder, f"{weight_name}.dat"), index[weight_name]
            )
        tensor_file = os.path.join(offload_folder, f"{param_name}.dat")
        weight = load_offloaded_weight(tensor_file, metadata)
        set_module_tensor_to_device(model, param_name, "cpu", value=weight, fp16_statistics=fp16_statistics)


def get_balanced_memory(
    model: nn.Module,
    max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
    no_split_module_classes: Optional[List[str]] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    special_dtypes: Optional[Dict[str, Union[str, torch.dtype]]] = None,
    low_zero: bool = False,
):
    """
    Compute a `max_memory` dictionary for [`infer_auto_device_map`] that will balance the use of each available GPU.

    <Tip>

    All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the
    meta device (as it would if initialized within the `init_empty_weights` context manager).

    </Tip>

    Args:
        model (`torch.nn.Module`):
            The model to analyze.
        max_memory (`Dict`, *optional*):
            A dictionary mapping device identifiers to their maximum memory. Will default to the maximum memory
            available if unset.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across devices (for instance any layer that has a
            residual connection).
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, the weights will be converted to that type when loaded.
        special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*):
            If provided, special dtypes to consider for some specific weights (will override dtype used as default for
            all weights).
        low_zero (`bool`, *optional*):
            Minimizes the number of weights on GPU 0, which is convenient when it's used for other operations (like the
            Transformers generate function).
    """
    # Get default / clean up max_memory.
    user_not_set_max_memory = max_memory is None
    max_memory = get_max_memory(max_memory)

    if not is_xpu_available():
        num_devices = len([d for d in max_memory if torch.device(d).type == "cuda" and max_memory[d] > 0])
    else:
        num_devices = len(
            [
                d
                for d in max_memory
                if (torch.device(d).type == "xpu" or torch.xpu.get_device_properties(d).dev_type == "gpu")
                and max_memory[d] > 0
                and d != "cpu"
            ]
        )

    if num_devices == 0:
        return max_memory

    if num_devices == 1:
        # We cannot do low_zero on just one GPU, but we will still reserve some memory for the buffer.
        low_zero = False
        # If the user did not set max_memory, we should avoid an OOM on the single device.
        if user_not_set_max_memory:
            for key in max_memory.keys():
                if isinstance(key, int):
                    max_memory[key] *= 0.9  # 90% is a good compromise
                    logger.info(
                        f"We will use 90% of the memory on device {key} for storing the model, and 10% for the "
                        "buffer to avoid OOM. You can set `max_memory` to a higher value to use more memory (at "
                        "your own risk)."
                    )
                    break  # only one device

    module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes)
    per_gpu = module_sizes[""] // (num_devices - 1 if low_zero else num_devices)

    # We can't just set the memory to model_size // num_devices as it will end up being too small: each GPU will get
    # slightly fewer layers and some layers will end up offloaded at the end. So this function computes a buffer size
    # to add, which is the biggest of:
    # - the size of a no-split block (if applicable)
    # - the mean of the layer sizes
    if no_split_module_classes is None:
        no_split_module_classes = []
    elif not isinstance(no_split_module_classes, (list, tuple)):
        no_split_module_classes = [no_split_module_classes]

    # Identify the size of the no_split_block modules.
    if len(no_split_module_classes) > 0:
        no_split_children = {}
        for name, size in module_sizes.items():
            if name == "":
                continue
            submodule = model
            for submodule_name in name.split("."):
                submodule = getattr(submodule, submodule_name)
            class_name = submodule.__class__.__name__
            if class_name in no_split_module_classes and class_name not in no_split_children:
                no_split_children[class_name] = size

            if set(no_split_children.keys()) == set(no_split_module_classes):
                break
        buffer = max(no_split_children.values()) if len(no_split_children) > 0 else 0
    else:
        buffer = 0

    # Compute the mean of the final modules. In the first dict of module sizes, leaves are the parameters.
    leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
    module_sizes = {n: v for n, v in module_sizes.items() if n not in leaves}
    # Once removed, leaves are the final modules.
    leaves = [n for n in module_sizes if len([p for p in module_sizes if n == "" or p.startswith(n + ".")]) == 0]
    mean_leaves = int(sum([module_sizes[n] for n in leaves]) / max(len(leaves), 1))
    buffer = int(1.25 * max(buffer, mean_leaves))
    per_gpu += buffer

    # Sorted list of GPU ids (entries of max_memory for devices we do not have are ignored).
    gpus_idx_list = list(
        sorted(
            device_id for device_id, device_mem in max_memory.items() if isinstance(device_id, int) and device_mem > 0
        )
    )
    # The last device is left with max_memory just in case the buffer is not enough.
    for idx in gpus_idx_list[:-1]:
        max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])

    if low_zero:
        min_zero = max(0, module_sizes[""] - sum([max_memory[i] for i in range(1, num_devices)]))
        max_memory[0] = min(min_zero, max_memory[0])

    return max_memory


def calculate_maximum_sizes(model: torch.nn.Module):
    "Computes the total size of the model and its largest layer"
    sizes = compute_module_sizes(model)
    # `transformers` models store this information for us.
    no_split_modules = getattr(model, "_no_split_modules", None)
    if no_split_modules is None:
        no_split_modules = []

    modules_to_treat = (
        list(model.named_parameters(recurse=False))
        + list(model.named_children())
        + list(model.named_buffers(recurse=False))
    )
    largest_layer = get_max_layer_size(modules_to_treat, sizes, no_split_modules)
    total_size = sizes[""]
    return total_size, largest_layer


def infer_auto_device_map(
    model: nn.Module,
    max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
    no_split_module_classes: Optional[List[str]] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    special_dtypes: Optional[Dict[str, Union[str, torch.dtype]]] = None,
    verbose: bool = False,
):
    """
    Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk,
    such that:
    - we don't exceed the memory available of any of the GPUs.
    - if offload to the CPU is needed, there is always room left on GPU 0 to put back the layer offloaded on CPU that
      has the largest size.
    - if offload to the CPU is needed, we don't exceed the RAM available on the CPU.
    - if offload to the disk is needed, there is always room left on the CPU to put back the layer offloaded on disk
      that has the largest size.

    <Tip>

    All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the
    meta device (as it would if initialized within the `init_empty_weights` context manager).

    </Tip>

    Args:
        model (`torch.nn.Module`):
            The model to analyze.
        max_memory (`Dict`, *optional*):
            A dictionary mapping device identifiers to their maximum memory. Will default to the maximum memory
            available if unset.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across devices (for instance any layer that has a
            residual connection).
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, the weights will be converted to that type when loaded.
        special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*):
            If provided, special dtypes to consider for some specific weights (will override dtype used as default for
            all weights).
        verbose (`bool`, *optional*, defaults to `False`):
            Whether or not to provide debugging statements as the function builds the device_map.
    """
    # Get default / clean up max_memory.
    max_memory = get_max_memory(max_memory)
    if no_split_module_classes is None:
        no_split_module_classes = []
    elif not isinstance(no_split_module_classes, (list, tuple)):
        no_split_module_classes = [no_split_module_classes]

    devices = list(max_memory.keys())
    if "disk" not in devices:
        devices.append("disk")
    gpus = [device for device in devices if device not in ["cpu", "disk"]]

    # Devices that need to keep space for a potential offloaded layer.
    if "mps" in gpus:
        main_devices = ["mps"]
    elif len(gpus) > 0:
        main_devices = [gpus[0], "cpu"]
    else:
        main_devices = ["cpu"]

    module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes)
    tied_parameters = find_tied_parameters(model)

    if check_tied_parameters_in_config(model) and len(tied_parameters) == 0:
        logger.warn(
            "The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function."
        )

    device_map = {}
    current_device = 0
    current_memory_used = 0

    # Direct submodules and parameters.
    modules_to_treat = (
        list(model.named_parameters(recurse=False))
        + list(model.named_children())
        + list(model.named_buffers(recurse=False))
    )
    # Initialize the maximum largest layer, to know when we need to split.
    max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)

    while len(modules_to_treat) > 0:
        name, module = modules_to_treat.pop(0)
        if verbose:
            print(f"\nTreating module {name}.")
        # The max size in the remaining layers may have changed since we took one, so we reset it.
        max_layer_names = [n for n in max_layer_names if n != name and not n.startswith(name + ".")]
        if len(max_layer_names) == 0:
            max_layer_size, max_layer_names = get_max_layer_size(
                [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
                module_sizes,
                no_split_module_classes,
            )
        # Assess the size needed.
        module_size = module_sizes[name]

        # We keep relevant tied parameters only: one of the tied parameters in the group is inside the current module
        # and the other is not.
        tied_param_goups = [
            tied_group
            for tied_group in tied_parameters
            if any(name in k for k in tied_group) and not all(name in k for k in tied_group)
        ]
        if verbose and len(tied_param_goups) > 0:
            print(f"  Found the relevant tied param groups {tied_param_goups}")
        # Then we keep track of all the parameters that are tied to the current module, but not in the current module.
        tied_params = sum([[p for p in tied_group if name not in p] for tied_group in tied_param_goups], [])
        if verbose and len(tied_params) > 0:
            print(f"  So those parameters need to be taken into account {tied_params}")

        device = devices[current_device]
        current_max_size = max_memory[device] if device != "disk" else None
        # Reduce the max size available by the largest layer.
        if devices[current_device] in main_devices:
            current_max_size = current_max_size - max_layer_size
        # Case 1 -> the module is too big.
        if current_max_size is not None and current_memory_used + module_size > current_max_size:
            # Split or not split?
            modules_children = [] if isinstance(module, nn.Parameter) else list(module.named_children())
            if verbose:
                print(
                    f"Not enough space on {devices[current_device]} to put {name} (space available "
                    f"{current_max_size - current_memory_used}, module size {module_size})."
                )
            if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
                # No split, we go to the next device.
                if verbose:
                    print("This module cannot be split, going to the next device.")
                current_device += 1
                modules_to_treat = [(name, module)] + modules_to_treat
                current_memory_used = 0
            else:
                # Split: we replace the module studied by its children + parameters.
                if verbose:
                    print(f"Splitting {name}.")
                modules_children = list(module.named_parameters(recurse=False)) + modules_children
                modules_to_treat = [(f"{name}.{n}", v) for n, v in modules_children] + modules_to_treat
                # Update the max layer size.
                max_layer_size, max_layer_names = get_max_layer_size(
                    [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
                    module_sizes,
                    no_split_module_classes,
                )

        # Case 2, it fits! We're not entirely out of the woods though, because we may have some tied parameters.
        elif len(tied_params) > 0:
            # First locate all tied modules.
            tied_module_names = []
            tied_modules = []
            for tied_param in tied_params:
                tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n in tied_param][0]
                tied_module_names.append(modules_to_treat[tied_module_index][0])
                tied_modules.append(modules_to_treat[tied_module_index][1])
            if verbose:
                print(
                    f"  It looks like {name} is going to fit on {devices[current_device]} but we have tied "
                    f"parameters to account for.\n  - Names {tied_params}\n  - Module names {tied_module_names}"
                )

            # Let's see if it all fits first.
            module_size_with_ties = module_size
            for tied_param, tied_module_name in zip(tied_params, tied_module_names):
                module_size_with_ties += module_sizes[tied_module_name] - module_sizes[tied_param]

            if current_max_size is None or current_memory_used + module_size_with_ties <= current_max_size:
                # We really really fit!
                if verbose:
                    print(f"Putting {name} and {tied_module_names} on {devices[current_device]}.")
                current_memory_used += module_size_with_ties
                device_map[name] = devices[current_device]
                for tied_module_name in tied_module_names:
                    if tied_module_name in [m[0] for m in modules_to_treat]:
                        # The module may have been removed by a previous iteration of this loop.
                        tied_module_index = [
                            i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name
                        ][0]
                        modules_to_treat.pop(tied_module_index)
                    device_map[tied_module_name] = devices[current_device]
            else:
                # We don't fit with the tied modules. Next question is: can we split one of the tied modules to make
                # it smaller, or do we need to go to the next device?
                if verbose:
                    print(
                        f"Not enough space on {devices[current_device]} to put {name} and {tied_module_names} (space "
                        f"available {current_max_size - current_memory_used}, needed size {module_size_with_ties})."
                    )
                split_happened = False
                for tied_module_name, tied_module in zip(tied_module_names, tied_modules):
                    tied_module_children = list(tied_module.named_children())
                    if len(tied_module_children) == 0 or tied_module.__class__.__name__ in no_split_module_classes:
                        # Can't split this one.
                        continue

                    if verbose:
                        print(f"Splitting {tied_module_name}.")
                    tied_module_children = list(tied_module.named_parameters(recurse=False)) + tied_module_children
                    tied_module_children = [(f"{tied_module_name}.{n}", v) for n, v in tied_module_children]
                    tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n == tied_module_name][0]

                    modules_to_treat = (
                        [(name, module)]
                        + modules_to_treat[:tied_module_index]
                        + tied_module_children
                        + modules_to_treat[tied_module_index + 1 :]
                    )
                    # Update the max layer size.
                    max_layer_size, max_layer_names = get_max_layer_size(
                        [(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
                        module_sizes,
                        no_split_module_classes,
                    )
                    split_happened = True
                    break

                if not split_happened:
                    # If none of the tied modules can be split, we go to the next device.
                    if verbose:
                        print("None of the tied module can be split, going to the next device.")
                    current_device += 1
                    modules_to_treat = [(name, module)] + modules_to_treat
                    current_memory_used = 0

        else:
            if verbose:
                if current_max_size is None:
                    print(f"Putting {name} (size={module_size}) on {devices[current_device]}.")
                else:
                    print(
                        f"Putting {name} (size={module_size}) on {devices[current_device]} "
                        f"(available={current_max_size - current_memory_used})."
                    )
            current_memory_used += module_size
            device_map[name] = devices[current_device]

    return clean_device_map(device_map)


def check_device_map(model: nn.Module, device_map: Dict[str, Union[int, str, torch.device]]):
    """
    Checks a device map covers everything in a given model.

    Args:
        model (`torch.nn.Module`): The model to check the device map against.
        device_map (`Dict[str, Union[int, str, torch.device]]`): The device map to check.
    """
    all_model_tensors = [name for name, _ in model.state_dict().items()]
    for module_name in device_map.keys():
        if module_name == "":
            all_model_tensors.clear()
            break
        else:
            all_model_tensors = [
                name
                for name in all_model_tensors
                if not name == module_name and not name.startswith(module_name + ".")
            ]
    if len(all_model_tensors) > 0:
        non_covered_params = ", ".join(all_model_tensors)
        raise ValueError(
            f"The device_map provided does not give any device for the following parameters: {non_covered_params}"
        )


def load_state_dict(checkpoint_file, device_map=None):
    """
    Load a checkpoint from a given file. If the checkpoint is in the safetensors format and a device map is passed, the
    weights can be fast-loaded directly on the GPU.

    Args:
        checkpoint_file (`str`): The path to the checkpoint to load.
        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.
    """
    if checkpoint_file.endswith(".safetensors"):
        if not is_safetensors_available():
            raise ImportError(
                f"To load {checkpoint_file}, the `safetensors` library is necessary `pip install safetensors`."
            )
        with safe_open(checkpoint_file, framework="pt") as f:
            metadata = f.metadata()
            weight_names = f.keys()

        if metadata is None:
            logger.warn(
                f"The safetensors archive passed at {checkpoint_file} does not contain metadata. "
                "Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata."
            )
            metadata = {"format": "pt"}

        if metadata.get("format") not in ["pt", "tf", "flax"]:
            raise OSError(
                f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
                "you save your model with the `save_pretrained` method."
            )
        elif metadata["format"] != "pt":
            raise ValueError(f"The checkpoint passed was saved with {metadata['format']}, we need the pt format.")
        if device_map is None:
            return safe_load_file(checkpoint_file)
        else:
            # If we only have one device, we can load everything directly.
            if len(set(device_map.values())) == 1:
                return safe_load_file(checkpoint_file, device=list(device_map.values())[0])

            devices = list(set(device_map.values()) - {"disk"})
            # The cpu device should always exist as a fallback option.
            if "cpu" not in devices:
                devices.append("cpu")

            # For each device, get the weights that go there.
            device_weights = {device: [] for device in devices}
            for module_name, device in device_map.items():
                if device in devices:
                    device_weights[device].extend(
                        [k for k in weight_names if k == module_name or k.startswith(module_name + ".")]
                    )

            # All weights that haven't been assigned a device are loaded on CPU.
            device_weights["cpu"].extend([k for k in weight_names if k not in sum(device_weights.values(), [])])
            tensors = {}
            if is_tqdm_available():
                progress_bar = tqdm(
                    main_process_only=False,
                    total=sum([len(device_weights[device]) for device in devices]),
                    unit="w",
                    smoothing=0,
                    leave=False,
                )
            else:
                progress_bar = None
            for device in devices:
                with safe_open(checkpoint_file, framework="pt", device=device) as f:
                    for key in device_weights[device]:
                        if progress_bar is not None:
                            progress_bar.set_postfix(dev=device, refresh=False)
                            progress_bar.set_description(key)
                        tensors[key] = f.get_tensor(key)
                        if progress_bar is not None:
                            progress_bar.update()
            if progress_bar is not None:
                progress_bar.close()

            return tensors
    else:
        return torch.load(checkpoint_file, map_location=torch.device("cpu"))


def load_checkpoint_in_model(
    model: nn.Module,
    checkpoint: Union[str, os.PathLike],
    device_map: Optional[Dict[str, Union[int, str, torch.device]]] = None,
    offload_folder: Optional[Union[str, os.PathLike]] = None,
    dtype: Optional[Union[str, torch.dtype]] = None,
    offload_state_dict: bool = False,
    offload_buffers: bool = False,
    keep_in_fp32_modules: List[str] = None,
    offload_8bit_bnb: bool = False,
):
    """
    Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
    loaded.

    <Tip warning={true}>

    Once loaded across devices, you still need to call [`dispatch_model`] on your model to make it able to run. To
    group the checkpoint loading and dispatch in one single call, use [`load_checkpoint_and_dispatch`].

    </Tip>

    Args:
        model (`torch.nn.Module`):
            The model in which we want to load a checkpoint.
        checkpoint (`str` or `os.PathLike`):
            The folder checkpoint to load. It can be:
            - a path to a file containing a whole model state dict
            - a path to a `.json` file containing the index to a sharded checkpoint
            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
            - a path to a folder containing a unique pytorch_model.bin or a model.safetensors file.
        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.
        offload_folder (`str` or `os.PathLike`, *optional*):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        dtype (`str` or `torch.dtype`, *optional*):
            If provided, the weights will be converted to that type when loaded.
        offload_state_dict (`bool`, *optional*, defaults to `False`):
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit.
        offload_buffers (`bool`, *optional*, defaults to `False`):
            Whether or not to include the buffers in the weights offloaded to disk.
        keep_in_fp32_modules (`List[str]`, *optional*):
            A list of the modules that we keep in `torch.float32` dtype.
        offload_8bit_bnb (`bool`, *optional*):
            Whether or not to enable offload of 8-bit modules on cpu/disk.
    """
    if offload_8bit_bnb:
        from .bnb import quantize_and_offload_8bit

    tied_params = find_tied_parameters(model)

    if check_tied_parameters_in_config(model) and len(tied_params) == 0:
        logger.warn(
            "The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function."
        )
    if device_map is not None:
        check_tied_parameters_on_same_device(tied_params, device_map)

    if offload_folder is None and device_map is not None and "disk" in device_map.values():
        raise ValueError(
            "At least one of the model submodule will be offloaded to disk, please pass along an `offload_folder`."
        )
    elif offload_folder is not None and device_map is not None and "disk" in device_map.values():
        os.makedirs(offload_folder, exist_ok=True)

    if isinstance(dtype, str):
        # We accept "torch.float16" or just "float16".
        dtype = dtype.replace("torch.", "")
        dtype = getattr(torch, dtype)

    checkpoint_files = None
    index_filename = None
    if os.path.isfile(checkpoint):
        if str(checkpoint).endswith(".json"):
            index_filename = checkpoint
        else:
            checkpoint_files = [checkpoint]
    elif os.path.isdir(checkpoint):
        # Check if the whole state dict is present.
        potential_state_bin = [f for f in os.listdir(checkpoint) if f == WEIGHTS_NAME]
        potential_state_safetensor = [f for f in os.listdir(checkpoint) if f == SAFE_WEIGHTS_NAME]
        if len(potential_state_bin) == 1:
            checkpoint_files = [os.path.join(checkpoint, potential_state_bin[0])]
        elif len(potential_state_safetensor) == 1:
            checkpoint_files = [os.path.join(checkpoint, potential_state_safetensor[0])]
        else:
            # Otherwise check for a sharded checkpoint.
            potential_index = [f for f in os.listdir(checkpoint) if f.endswith(".index.json")]
            if len(potential_index) == 0:
                raise ValueError(
                    f"{checkpoint} is not a folder containing a `.index.json` file or a {WEIGHTS_NAME} or a "
                    f"{SAFE_WEIGHTS_NAME} file"
                )
            elif len(potential_index) == 1:
                index_filename = os.path.join(checkpoint, potential_index[0])
            else:
                raise ValueError(
                    f"{checkpoint} containing more than one `.index.json` file, delete the irrelevant ones."
                )
    else:
        raise ValueError(
            "`checkpoint` should be the path to a file containing a whole state dict, or the index of a sharded "
            f"checkpoint, or a folder containing a sharded checkpoint or the whole state dict, but got {checkpoint}."
        )

    if index_filename is not None:
        checkpoint_folder = os.path.split(index_filename)[0]
        with open(index_filename, "r") as f:
            index = json.loads(f.read())

        if "weight_map" in index:
            index = index["weight_map"]
        checkpoint_files = sorted(list(set(index.values())))
        checkpoint_files = [os.path.join(checkpoint_folder, f) for f in checkpoint_files]

    offload_index = {}
    if offload_state_dict:
        state_dict_folder = tempfile.mkdtemp()
        state_dict_index = {}

    buffer_names = [name for name, _ in model.named_buffers()]
    for checkpoint_file in checkpoint_files:
        checkpoint = load_state_dict(checkpoint_file, device_map=device_map)
        if device_map is None:
            model.load_state_dict(checkpoint, strict=False)
        else:
            for param_name, param in checkpoint.items():
                # Skip SCB parameters (used for 8-bit serialization); they are handled with their weight.
                if "SCB" in param_name:
                    continue

                module_name = param_name
                while len(module_name) > 0 and module_name not in device_map:
                    module_name = ".".join(module_name.split(".")[:-1])
                if module_name == "" and "" not in device_map:
                    raise ValueError(f"{param_name} doesn't have any device set.")
                param_device = device_map[module_name]

                new_dtype = dtype
                if dtype is not None and torch.is_floating_point(param):
                    if keep_in_fp32_modules is not None and dtype == torch.float16:
                        proceed = False
                        for key in keep_in_fp32_modules:
                            if ((key in param_name) and (key + "." in param_name)) or key == param_name:
                                proceed = True
                                break
                        if proceed:
                            new_dtype = torch.float32

                fp16_statistics = None
                if "weight" in param_name and param_name.replace("weight", "SCB") in checkpoint.keys():
                    if param.dtype == torch.int8:
                        fp16_statistics = checkpoint[param_name.replace("weight", "SCB")]

                if param_device == "disk":
                    if offload_buffers or param_name not in buffer_names:
                        if new_dtype is None:
                            new_dtype = param.dtype
                        if offload_8bit_bnb:
                            quantize_and_offload_8bit(
                                model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics
                            )
                            continue
                        else:
                            set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype)
                        offload_weight(param, param_name, offload_folder, index=offload_index)
                elif param_device == "cpu" and offload_state_dict:
                    if new_dtype is None:
                        new_dtype = param.dtype
                    if offload_8bit_bnb:
                        quantize_and_offload_8bit(
                            model, param, param_name, new_dtype, state_dict_folder, state_dict_index, fp16_statistics
                        )
                    else:
                        set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype)
                        offload_weight(param, param_name, state_dict_folder, index=state_dict_index)
                else:
                    set_module_tensor_to_device(
                        model,
                        param_name,
                        param_device,
                        value=param,
                        dtype=new_dtype,
                        fp16_statistics=fp16_statistics,
                    )

        # Force Python to clean up.
        del checkpoint
        gc.collect()

    save_offload_index(offload_index, offload_folder)

    # Load back the offloaded state dict on CPU.
    if offload_state_dict:
        load_offloaded_weights(model, state_dict_index, state_dict_folder)
        shutil.rmtree(state_dict_folder)

    retie_parameters(model, tied_params)
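

# Usage sketch (hypothetical paths and model; "Block" is a placeholder class name):
#
#     >>> device_map = infer_auto_device_map(model, no_split_module_classes=["Block"])
#     >>> load_checkpoint_in_model(
#     ...     model, "sharded-checkpoint/", device_map=device_map, offload_folder="offload/"
#     ... )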


def get_mixed_precision_context_manager(native_amp: bool = False, autocast_kwargs: AutocastKwargs = None):
    """
    Return a context manager for autocasting mixed precision

    Args:
        native_amp (`bool`, *optional*, defaults to False):
            Whether mixed precision is actually enabled.
        autocast_kwargs (`AutocastKwargs`, *optional*, defaults to None):
            Additional keyword arguments forwarded to `torch.autocast` (such as `cache_enabled`, which controls
            whether the weight cache inside autocast should be enabled).
    """
    state = AcceleratorState()
    if autocast_kwargs is None:
        autocast_kwargs = {}
    else:
        autocast_kwargs = autocast_kwargs.to_kwargs()
    if native_amp:
        if state.mixed_precision == "fp16":
            return torch.autocast(device_type=state.device.type, dtype=torch.float16, **autocast_kwargs)
        elif state.mixed_precision == "bf16" and state.distributed_type in [
            DistributedType.NO,
            DistributedType.MULTI_CPU,
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_XPU,
        ]:
            return torch.autocast(device_type=state.device.type, dtype=torch.bfloat16, **autocast_kwargs)
        else:
            return torch.autocast(device_type=state.device.type, **autocast_kwargs)
    else:
        return contextlib.nullcontext()
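

# Usage sketch: wrap a forward pass in the returned context once the current
# `AcceleratorState` has been configured with mixed precision.
#
#     >>> with get_mixed_precision_context_manager(native_amp=True):
#     ...     outputs = model(inputs)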