U
    *-e                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dl m  m!Z" d dlm#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0 d	d
l1m2Z2m3Z3 ddddddgZ4e5e6Z7dZ8dZ9dZ:dZ;G dd deZ<e<j=e<j>fZ?e<j@e<jAfZBG dd deZCG dd deZDG dd deZEG dd deZFG dd de0ZGG dd dejHeGdZIG dd dZJejKeLejHdd d!d"ZMejKeLe#dd#d$d%ZNejKeLee#ejHf d&d'd(ZOeeej#ejHf  eejH d)d*d+ZPeejHe#f e#d,d-d.ZQejRd/d0d1ZSeTd2d3d4 ZUeVejReWejXd5d6d7ZYeTd	ejZeLd8d9d:Z[dS );    N)autoEnum)
accumulatechain)AnyCallablecastDict	GeneratorIteratorList
NamedTupleno_type_checkOptionalSequenceSetTupleUnion)Tensor)_FSDPDeviceHandle!_named_parameters_with_duplicates_no_dispatch_record_stream_same_storage_as_data_ptr_set_fsdp_flattenedHandleTrainingState)_alloc_storage_free_storage	_p_assert)_ParameterMeta   )_ext_post_unflatten_transform_ext_pre_flatten_transformFlatParameterFlatParamHandleFlatParamShardMetadata	ParamInfoSharedParamInfoHandleShardingStrategyZFSDP_USE_UNSAFE_SETATTRZFSDP_SKIP_WRITEBACK_CHECKZFSDP_USE_FULL_PREC_IN_EVAL*   c                   @   s*   e Zd Ze Ze Ze Ze Ze ZdS )r'   N)	__name__
__module____qualname__r   
FULL_SHARDSHARD_GRAD_OPNO_SHARDHYBRID_SHARD_HYBRID_SHARD_ZERO2 r1   r1   b/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/torch/distributed/fsdp/flat_param.pyr'   d   s
   c                   @   s,   e Zd ZU dZeed< ejed< eed< dS )r%   z&Information for an original parameter.
param_namemodulemodule_nameNr)   r*   r+   __doc__str__annotations__nnModuler1   r1   r1   r2   r%   v   s   

c                   @   sF   e Zd ZU dZeed< ejed< eed< eed< ejed< eed< dS )	r&   ai  
    Additional information for a shared parameter.

    For each shared parameter, we designate one module and its parameter
    variable to be the primary owner, determined as the first one encountered
    in the parameter walk. These are prefixed with "prim". The primary module
    and parameter do not have their own :class:`SharedParamInfo` instance.
    r3   r4   r5   prim_param_nameprim_moduleprim_module_nameNr6   r1   r1   r1   r2   r&   ~   s   
	

c                   @   sJ   e Zd ZU dZeed< ee ed< ee ed< ee ed< ee ed< dS )_ShardParamInfoz4Shard-related information for an original parameter.in_shardoffset_in_shardnumel_in_shardintra_param_start_idxintra_param_end_idxN)r)   r*   r+   r7   boolr9   r   intr1   r1   r1   r2   r?      s   
r?   c                   @   s\   e Zd ZU dZeedf ed< eejdf ed< ee	df ed< eee	e	f df ed< dS )r$   a  
    This holds metadata specific to this rank's shard of the flat parameter.

    Attributes:
        param_names (Tuple[str, ...]): Prefixed parameter names of this rank's
            shard of the parameters; see :class:`FlatParameter`.
        param_shapes (Tuple[torch.Size, ...]): Parameter shapes of this rank's
            shard of the parameters; see :class:`FlatParameter`.
        param_numels (Tuple[int, ...]): Parameter numels of this rank's shard
            of the parameters; see :class:`FlatParameter`.
        param_offsets (Tuple[Tuple[int, int], ...]): [start, end] offsets (in
            units of numels) giving this rank's part of each flattened
            original parameter.
    .Zparam_namesZparam_shapesZparam_numelsparam_offsetsN)
r)   r*   r+   r7   r   r8   r9   torchSizerF   r1   r1   r1   r2   r$      s
   
c                   @   s   e Zd Zdd ZdS )_FlatParameterMetac                 C   s   t |tjot|ddS )N_is_flat_paramF)
isinstancerH   r   getattr)selfinstancer1   r1   r2   __instancecheck__   s
      z$_FlatParameterMeta.__instancecheck__N)r)   r*   r+   rP   r1   r1   r1   r2   rJ      s   rJ   c                   @   s  e Zd ZU dZejed< ejed< ejed< eed< ee	df ed< eejdf ed< ee
df ed	< eee df ed
< eedf ed< eedf ed< eedf ed< eedf ed< eej ed< eed< eed< eed< eed< eeef ed< eed< eed< eed< eeej  ed< eeej  ed< eeee   ed< eee  ed< ee ed< d$dd Zeee	 ee eej ee
 ee eee  eeej  eeej  ee dd!
d"d#ZdS )%r"   a  
    This is the flat parameter used by :class:`FullyShardedDataParallel`. It is
    comprised of one or more original parameters, which are flattened and
    concatenated to construct the flat parameter.

    Under the current design, this parameter logically represents both the
    unsharded and sharded flat parameter, and its data changes storages
    dynamically.
        - In the :class:`FullyShardedDataParallel` constructor, the parameter
        is initialized as unsharded and then sharded in-place.
        - At runtime, the parameter is lazily (re)-initialized. The sharded
        parameter data is saved in ``self._local_shard``, and a new ``Tensor``
        ``self._full_param_padded`` is created, which is the all-gather
        destination and owns the unsharded parameter storage thereafter. (See
        :meth:`FlatParamHandle.init_flat_param_attributes`.)
        - Throughout runtime, the parameter data changes storages as needed,
        e.g. to the sharded flat parameter, low precision sharded flat
        parameter, or the unsharded flat parameter.

    NOTE: Since ``use_orig_params=True`` supports intra-``FlatParameter``
    padding, we have two versions of the per-parameter numels, one that
    includes the padding (``_numels_with_padding``) and one that does not
    (``_numels``). The former may have length longer than the other data
    structures, while the latter has the same length as the number of actual
    original parameters like the other per-parameter data structures.

    NOTE: This is not a real class; instead, you will always get a Parameter
    back out if you try to create one of these.  This is similar to the trick
    we implemented for Parameter to get it to work with subclasses; this
    is primarily so that FlatParameter supports combination with FakeTensor.

    Attributes:
        _unpadded_unsharded_size (torch.Size): Unsharded flat parameter's size
            without right-hand-side padding for divisibility by the world size.
            For ``use_orig_params=True``, this includes alignment padding.
        _padded_unsharded_size (torch.Size): Unsharded flat parameter's size
            with right-hand-side padding for divisibility by the world size.
            For ``use_orig_params=True``, this includes alignment padding. This
            is only set for sharded strategies since they require padding for
            the all-gather.
        _sharded_size (torch.Size): Sharded flat parameter's size with padding.
            This is also set for ``NO_SHARD``, in which case it is the same as
            the unsharded sizes. (We omit "padded" because there is no
            analogous unpadded one.)

        _num_params (int): Number of original parameters flattened into this
            flat parameter. This is the length of the per-parameter data
            structures.
        _param_infos (Tuple[ParamInfo, ...]): Each parameter's parameter info
            entry; see :class:`ParamInfo` for details.
        _shapes (Tuple[torch.Size, ...]): Each parameter's original shape.
        _fqns (Tuple[str, ...]): Each parameter's fully-qualified name (FQN)
            prefixed from the ``_fully_sharded_module``. The names are
            guaranteed to be unique in the subtree rooted at that module.
        _param_extensions (Tuple[Optional[Any], ...]): Each parameter's
            extension (i.e. some per-parameter state) used to customize
            pre-flatten and post-unflatten behavior or ``None``. This is
            experimental, and users should not depend on its existence in the
            future.
        _numels_with_padding (Tuple[int, ...]): Each parameter's numel
            including entries for the padding. This is used to construct views
            into the flat parameter via ``torch.split()``. This may have length
            longer than ``_num_params``.
        _numels (Tuple[int, ...]): Each parameter's numel excluding entries for
            padding. This has length equal to ``_num_params``.
        _shard_param_infos (Tuple[_ShardParamInfo, ...]): Each parameter's
            shard parameter info; see :class:`_ShardParamInfo` for details.
        _shared_param_infos (Tuple[SharedParamInfo, ...]): Shared parameter
            info entries; see :class:`SharedParamInfo` for details.
        _modules (Set[nn.Module]): Modules that contain some original parameter
            that is flattened into the flat parameter.

        _shard_numel_padded (int): Numel padded for this rank's sharded flat
            parameter.
        _local_shard (Tensor): Sharded flat parameter with padding if using a
            sharded strategy. If using ``NO_SHARD``, then this is the unpadded
            unsharded flat parameter, and there is no notion of a sharded flat
            parameter or padded unsharded flat parameter.
        _full_param_padded (Tensor): Unsharded flat parameter with padding.
            This is not defined for ``NO_SHARD``. When using mixed precision
            for parameters, this has the low precision.
        _full_prec_full_param_padded (Tensor): Full precision unsharded flat
            parameter with padding. This is used for unsharding outside of
            computation when using mixed precision for parameters. This is
            never defined for ``NO_SHARD``.
        _post_backward_hook_state (Tuple[AccumulateGrad, RemovableHandle]):
            Flat parameter's :class:`AccumulateGrad` object and post-backward
            hook handle.
        _mp_shard (Tensor): Low precision sharded flat parameter with padding.
            This is only defined when parameter mixed precision is enabled. For
            ``NO_SHARD``, this is used for computation.
        _cpu_grad (Tensor): Sharded gradient with padding stored on CPU.
            This is only defined when offloading parameters is enabled.
        _saved_grad_shard (Tensor): Sharded gradient with padding from previous
            iterations for gradient accumulation without :meth:`no_sync`.

        _params (Optional[List[nn.Parameter]]): If ``use_orig_params=True``,
            then each original parameter variable; otherwise, ``None``. This
            does not include any padding tensors.
        _shared_params (Optional[List[nn.Parameter]]): The original shared
            parameter variables if ``use_orig_params=True`` and ``None``
            otherwise.
        _tensors (Optional[List[Optional[Tensor]]]): This saves the ``Tensor``
            views created in the forward and tracked by autograd when
            ``use_orig_params=True`` and is ``None`` otherwise. This is to
            preserve those ``Tensor`` variables for the backward to ensure that
            the ``FlatParameter`` 's ``AccumulateGrad`` object does not change
            in which case the post-backward hook does not run. This is relevant
            for cases like reentrant activation checkpointing.
        _is_grad_none_mask (Optional[List[bool]]): If ``use_orig_params=True``,
            a mask over the original parameters' gradients indicating if it is
            logically ``None`` or not; otherwise, ``None``. This does not
            include entries for padding. This mask is needed because only some
            of the parameters may have ``None`` gradient, in which case the
            flat gradient must be non-``None`` and must use zeros to
            approximate those original ``None`` gradients. This mask informs
            FSDP to set the original parameter gradients to ``None`` (instead
            of zeros) as needed.
    _unpadded_unsharded_size_padded_unsharded_size_sharded_size_num_params._param_infos_shapes_fqns_param_extensions_numels_with_padding_numels_shard_param_infos_shared_param_infos_modules_shard_numel_padded_local_shard_full_param_padded_full_prec_full_param_paddedZ_post_backward_hook_state	_mp_shard	_cpu_grad_saved_grad_shard_params_shared_params_tensors_is_grad_none_mask_is_padding_maskNTc                 C   s,   | t kstdtjtj||}d|_|S )Nz&subclasses FlatParameter not supportedT)r"   AssertionErrorr:   	Parameter__new__rK   )clsdatarequires_gradrr1   r1   r2   rl   T  s    zFlatParameter.__new__)
param_infosnumelsshapesfqnsshared_param_infosparam_extensionsparamsshared_paramsis_padding_maskreturnc                 C   s  t |t |kstt |t |ks(tt |t |ks<tt ||_||_||_||_||_|
|_g }t||
D ]\}}|sr|	| qrt
||_t
||_t |j|jkstt
||_dd |jD dd |jD |_|dk|	dkkst|dk	r|	dk	rt |	t |kstg |_t||
D ]\}}|s.|j	| q.|	|_t|j|jD ]}t| q`dd t|jD |_dd t|jD |_nd|_d|_d|_d|_| |_t| d|_dS )	a  
        Initializes attributes holding metadata about the original parameters
        comprising the flat parameter.

        We expose this method separate from the constructor to keep the
        constructor only responsible for the flat parameter's tensor data. This
        method should only be called once per model, while the constructor may
        be called multiple times, e.g. when reloading from a checkpoint, in
        which case only the tensor data needs to be passed to the constructor.
        Since :meth:`load_state_dict` is implemented via :meth:`copy_`, the
        metadata is correctly assumed to be unchanged.

        Args:
            See the Attributes in the class docstring.
        c                 S   s   h | ]
}|j qS r1   r4   .0pir1   r1   r2   	<setcomp>  s     z/FlatParameter._init_metadata.<locals>.<setcomp>c                 S   s   h | ]
}|j qS r1   r{   r}   Zspir1   r1   r2   r     s     Nc                 S   s   g | ]}d qS )Fr1   r}   _r1   r1   r2   
<listcomp>  s     z0FlatParameter._init_metadata.<locals>.<listcomp>c                 S   s   g | ]}d qS Nr1   r   r1   r1   r2   r     s     F)lenrj   rT   rU   rV   rW   rX   ri   zipappendtuplerZ   rY   r\   unionr]   re   rf   r   r   rangerh   rg   sizerQ   _post_backward_called)rm   rN   rq   rr   rs   rt   ru   rv   rw   rx   ry   Znumels_without_paddingnumel
is_paddingparamr1   r1   r2   _init_metadata]  sR    





zFlatParameter._init_metadata)NT)r)   r*   r+   r7   rH   rI   r9   rF   r   r%   r8   r   r   r?   r&   r   r:   r;   r   r   rk   rE   rl   classmethodr   r1   r1   r1   r2   r"      sP   
x



	
)	metaclassc                       s
  e Zd ZdZeeejef  ej	e
jeeee
j ee
j eejed
 fddZdd Zeddd	Zeeeejf  ej	eed
dddZeeeejf  edddZee eedddZee eeedddZee
j ee
j d
dddZe
 dd Zeeed
dddZ eeee!df d d!d"Z"e#eeeeeef d#d$d%Z$e#eeeeeef d#d&d'Z%e#eeee
j&d#d(d)Z'eeeef  d*d+d,Z(e)e*d*d-d.Z+e)e
 d
d*d/d0Z,ed*d1d2Z-d3d4 Z.d5d6 Z/ed*d7d8Z0d9d: Z1e
jd*d;d<Z2eed=d>d?Z3e
jd
d=d@dAZ4dBdC Z5dDdE Z6e
 dFdG Z7dHdI Z8dJdK Z9dLdM Z:e;j<dNdO Z=edPdQdRZ>dSdT Z?dUdV Z@d
d*dWdXZAe)dee
j eBe dYdZd[ZCe)dee ee dYd\d]ZDe)ed
d^d_d`ZEe)d
d*dadbZFe;j<eGd*dcddZHe)e
 d
d*dedfZIe)e
 d
d*dgdhZJe)e
 ed*didjZKee eee
j&eed
dkdldmZLdndo ZMdpdq ZNdrds ZOePej	 d*dtduZQeedYdvdwZReBeeSeSf  d*dxdyZTeBeeSeSf  d*dzd{ZUeVeeS d*d|d}ZWeVee d*d~dZXd
d*ddZYdd ZZedddZ[edddZ\e#edddZ]e#edddZ^dd Z_edddZ`edddZaeVed*ddZbeVed*ddZceVed*ddZdeVed*ddZeeVed*ddZf  ZgS )r#   a  
    This handle manages a flat parameter (:class:`FlatParameter`). This
    includes sharding and view management.

    Args:
        params (Sequence[nn.Parameter]): The parameters to flatten into the
            flat parameter.
        fully_sharded_module (nn.Module): See [Note: Fully Sharded Module].
        device (torch.device): The compute and communication device, which
            should be a non-CPU device. We refer to it as the compute device.
        sharding_strategy (ShardingStrategy): Sharding strategy to apply to
            this handle's ``FlatParameter``.
        offload_params (bool): Whether to offload the handle's
            ``FlatParameter`` to CPU.
        mp_param_dtype (Optional[torch.dtype]): Parameter mixed precision
            setting passed to the FSDP constructor.
        mp_reduce_dtype (Optional[torch.dtype]): Gradient reduction mixed
            precision setting passed to the FSDP constructor.
        keep_low_precision_grads (bool): Whether to keep gradients in low
            precision.
        use_orig_params (bool): If ``True``, then FSDP preserves the original
            parameter variables and returns them from ``named_parameters()``
            (e.g. to support different optimizer hyperparameters within one
            :class:`FlatParameter`). If ``False``, then FSDP reconstructs the
            parameters every iteration and returns the :class:`FlatParameter` s
            from ``named_parameters()``.
    )
rw   fully_sharded_moduledevicesharding_strategyoffload_paramsmp_param_dtypemp_reduce_dtypekeep_low_precision_gradsprocess_groupuse_orig_paramsc                    sr  t    t|}t|dkr2td| jj d|   tj	
tddk| _tj	
tddk| _| jrzttdt d |
}| | || _t| j| _|	| _|	 | _|	 | _|| _|| _|
| _|| _tj| _ t!" | _#|| _$d | _%d | _&d | _'d | _(d| _)d| _*d| _+|d j,| _-| .|| | j/d k	s8t0|rJt1| j/d	nd| _2| 3||| j2|
 | j4dd
 d S )Nr   zCannot construct a z with an empty parameter list 1zSince z=1, FSDP will not check for parameter or gradient writeback. Changing parameter or gradient storages may lead to silent correctness errors.Funsharded_dtype	as_params)5super__init__listr   
ValueError	__class__r)   _init_setattr_fnsosenvironget_FSDP_SKIP_WRITEBACK_CHECK_skip_writeback_check_FSDP_USE_FULL_PREC_IN_EVAL_use_full_prec_in_eval_warn_skip_writeback_checklog_init_get_unflat_views_fnr   r   Zfrom_device_device_handler   rankr   
world_size_sharding_strategy_offload_params_use_orig_params_keep_low_precision_gradsr   IDLE_training_statedistZget_debug_level_debug_level_fully_sharded_module'_unsharded_flat_param_for_skipped_viewsZ_handle_indexZ_pre_forward_order_indexZ_post_forward_indexZ_needs_pre_forward_unshardZ_needs_pre_backward_unshardZ_prefetcheddtype_orig_param_dtype_init_param_reduce_dtypes_fwd_bwd_param_dtyperj   _get_aligned_numelZ_aligned_numel_init_flat_param_and_metadata_use_unsharded_views)rN   rw   r   r   r   r   r   r   r   r   r   align_addressesr   r1   r2   r     sf    





   zFlatParamHandle.__init__c                 C   s<   t jtddk}|  |  |r,t| _t| _nt| _t| _d S )Nr   r   )	r   r   r   _FSDP_USE_UNSAFE_SETATTR_unsafe_setattr_tensor_setattr_tensor_unsafe_setattr_param_setattr_param_safe_setattr_tensor_or_param)rN   Zuse_unsafe_setattrr1   r1   r2   r     s    z!FlatParamHandle._init_setattr_fns)r   c                 C   s   |r
| j n| j| _d S r   )_get_unflat_views_aligned_get_unflat_views_unaligned_get_unflat_views)rN   r   r1   r1   r2   r   )  s    z)FlatParamHandle._init_get_unflat_views_fnN)rw   r4   aligned_numelr   rz   c           !      C   s  t |dkrtd|dk r*td| | |\}}}t|}g }	g }
g }g }g }i }g }g }g }g }d }}|jddD ]j\}}t|ddD ]R\}}||krq||kr|| \}}}|| |t|||||| q|dkr@|||  }|dkr@||k r@t||d|}|| |d |
| ||7 }t	|\}}t
tj|}|| |||f||< || |d |	t||| |
|  ||j |r|d | n|} ||  || 7 }|| 7 }qq~t |dkrtd	| d
| | jdkrB|dkrB||krBtd|| || |dkr| j|| j  }|dkr|| jk r| jdkrtd| t||d|}|| |d |
| ||7 }| j|d|d| _t| j|	|
|||||rt|nd|rt|nd|
 dS )af  
        NOTE: This should only be called once at construction time, after which
        the ``FlatParameter`` metadata is assumed to be static.

        NOTE: The elements of ``params`` should only be ``Tensor`` s when
        composing with ``DTensor`` -based tensor parallelism, in which case the
        elements may be ``DTensor`` local shards.
        r   zExpects non-empty `params`-Expects non-negative `aligned_numel` but got F)Zremove_duplicate)recurseT.z2`params` were not found in `module`'s treeparams: z	
module: zLFSDP FlatParameter address alignment created %s numel of padding (%s vs. %s)zFFSDP FlatParameter world size divisibility created %s numel of padding)r   ro   N)r   r   _validate_tensors_to_flattensetZnamed_modulesr   r   r&   _construct_padding_tensorr!   r   r:   rk   r%   r   shaper   r   infor   flatten_tensors_into_flat_param
flat_paramr"   r   _convert_to_params)!rN   rw   r4   r   r   r   flat_param_requires_gradr   Z
params_setrq   rr   rs   rt   ru   Zshared_param_memoZparams_to_flattenrx   rv   ry   total_numelZtotal_numel_without_paddingZsubmodule_name	submoduler3   r   r=   r>   r<   numel_to_padpadding_tensorZtransform_t	extensionfqnr1   r1   r2   r   0  s     


   







   


z-FlatParamHandle._init_flat_param_and_metadatatensorsrz   c                 C   s   d}d}d}|D ]}t |tr&td|dkr>| s>td|dk	rf|j|krftd| d|j | js|dk	r|j|krtd|dk	r|j|krtd| d|j |j}|p|j}|j}q|dk	std|||fS )	zV
        Validates the tensors to flatten and returns any necessary metadata.
        Nz Cannot flatten a `FlatParameter`z$Cannot flatten integer dtype tensorsz0Must flatten tensors with uniform dtype but got z and zNMust flatten tensors with uniform `requires_grad` when `use_orig_params=False`z5Must flatten tensors on the same device but got both z!Requires non-empty `tensors` list)	rL   r"   r   Zis_floating_pointr   r   ro   r   rj   )rN   r   r   r   r   tensorr1   r1   r2   r     s<    

z,FlatParamHandle._validate_tensors_to_flatten)r   r   rz   c                 C   s  t |dkrtd|dk r*td| | |\}}}g }|dkrd}|D ]`}|||  }	|	dkr|	|k rt|	|d|}
||
 ||	7 }|tt| || 7 }qN| j	|| j	  }	|	dkr|	| j	k rt|	|d|}
||
 ||	7 }ndd |D }tj
|ddS )a	  
        Flattens ``tensors`` into a single flat tensor optionally including
        padding if ``aligned_numel`` is greater than 0, where ``aligned_numel``
        gives the numel required to have address alignment.

        NOTE: The padding alignment algorithm must be kept in sync with
        :meth:`_init_flat_param_metadata`. We separate the two methods because
        the initialization happens once, whereas this method may be called
        multiple times throughout training (e.g. for checkpointing).
        r   zExpects non-empty `tensors`r   Fc                 S   s   g | ]}t t|qS r1   )rH   flatten_detach_if_needed)r}   r   r1   r1   r2   r     s    z3FlatParamHandle.flatten_tensors.<locals>.<listcomp>dim)r   r   r   r   r   rH   r   r   r   r   cat)rN   r   r   r   r   r   Zflat_tensorsr   r   r   r   r1   r1   r2   flatten_tensors  sJ       
   

zFlatParamHandle.flatten_tensors)r   r   ro   rz   c                 C   s   |  ||}t||dS )N)ro   )r   r"   )rN   r   r   ro   Zflat_param_datar1   r1   r2   r     s    z/FlatParamHandle.flatten_tensors_into_flat_param)r   r   rz   c                 C   sh   |dk	| _ |dk	| _| j r0| js0|| _| j| _n|p8| j| _|pD| j| _| jdk	sVt| jdk	sdtdS )a0  
        Precondition: ``self.flat_param`` is set. This ensures that this
        handle's parameters have a single dtype.

        Postcondition: This sets ``self._fwd_bwd_param_dtype`` and
        ``self._reduce_dtype``. If ``mp_param_dtype`` or ``mp_reduce_dtype``
        is ``None``, then we assume the original parameter dtype. One special
        case is if ``mp_param_dtype`` is not ``None`` and ``mp_reduce_dtype``
        is ``None``, in which case we assume the gradient reduction dtype
        matches the forward/backward parameter dtype.
        N)_low_prec_param_dtype_specified _low_prec_reduce_dtype_specifiedr   _reduce_dtyper   rj   )rN   r   r   r1   r1   r2   r   #  s    


z)FlatParamHandle._init_param_reduce_dtypesc                 C   s   | j }| js$| dd| d  nt| dkd | }t|| j	| j
\}}|| | | j	 }| | j	d  d }| ||| | dkr|d | jr|   dS )aE  
        Shards the handle's ``FlatParameter``. This allocates new memory for
        the sharded flat parameter and frees the unsharded flat parameter's
        storage.

        Postcondition: ``self.flat_param`` is the sharded flat parameter. Shard
        metadata attributes are set for all sharding strategies.
        r   r   z;The `FlatParameter` is not the sole occupant of its storageN)r   uses_sharded_strategy_init_shard_metadatar   r   Zstorage_offset_typed_storager#   
_get_shardr   r   set__sizeZ_resize_r   _use_sharded_views)rN   r   Zorig_storagesharded_flat_paramnumel_paddedZ	start_idxZend_idxr1   r1   r2   shardG  s*    

  

zFlatParamHandle.shard)r   unsharded_start_idxunsharded_end_idxrz   c                 C   s   | j }| |_| }t|dko(||kd| d|  t||kd| d|  | ||}t||jkstd|j dt| ||_	||_
dS )	a  
        Initializes shard-related metadata for this rank's shard of the flat
        parameter: ``_sharded_size``, ``_shard_param_infos``, and
        ``_shard_numel_padded``.

        Args:
            numel_padded (int): Numel padded for this rank's sharded flat
                parameter.
            unsharded_start_idx (int): Start index in the unsharded flat
            parameter assigned to this rank.
            unsharded_end_idx (int): End index (inclusive) in the unsharded
                flat parameter assigned to this rank.

        Precondition: ``self.flat_param`` 's data is the sharded flat
        parameter.
        r   zunsharded_start_idx: z unsharded_end_idx: znumel_padded: z sharded_flat_param_numel: zExpects length 	 but got N)r   r   rS   r   r   _get_shard_metadatar   rT   rj   r[   r^   )rN   r   r   r   r   sharded_flat_param_numelshard_param_infosr1   r1   r2   r   f  s*    
 z$FlatParamHandle._init_shard_metadata.)r   r   rz   c                 C   s   |   }t|t| jjks<tdt| jj dt| g }|| d }tt|| jjD ]\}\\}}}	|	rtq^||ko||k}
|
stddddd}nr||krd}|| }n|| }d}|dkr||k std| d| d	t	||| }|| d }td
||||}|
| q^t|S )z
        Computes the shard metadata based on ``unsharded_start_idx`` and
        ``unsharded_end_idx`` (inclusive), which give the interval of the
        unsharded flat parameter specifying the shard.
        z	Expected r   r   FNr   zInvalid `offset_in_shard` of z! for sharded flat parameter with z numelT)_get_flat_param_offsetsr   r   rY   rj   	enumerater   ri   r?   minr   r   )rN   r   r   Zflat_param_offsetsr   r   iZunsharded_param_start_idxZunsharded_param_end_idxr   in_sharded_flat_paramshard_param_inforC   rA   rD   rB   r1   r1   r2   r     sZ    


z#FlatParamHandle._get_shard_metadata)r   r   r   rz   c                 C   sd   t | |}t||d k r0|d d}n|| }|d  |  }|dks\td||fS )aa  
        Returns the shard of ``tensor`` without any padding for the given
        ``rank`` and ``world_size`` and the numel to pad for that shard.

        If ``tensor`` is already flattened or may be viewed in the flattened
        shape (which is true in the expected usage), then this method does not
        allocate any new tensor memory.
        r   r   z5Chunk's size should be at most the first chunk's size)rH   r   chunkr   Z	new_emptyr   rj   )r   r   r   chunksr  r   r1   r1   r2   _get_unpadded_shard  s    z#FlatParamHandle._get_unpadded_shardc                 C   s:   t | ||\}}| }|dkr2t|d|g}||fS )a(  
        Returns the shard of ``tensor`` with padding for the given ``rank`` and
        ``world_size`` and the numel padded for that shard.

        This method allocates new memory (via :meth:`clone`) since the
        unsharded ``tensor`` may be deallocated after this method returns.
        r   )r#   r  cloneFpad)r   r   r   r  r   r   r1   r1   r2   r     s      zFlatParamHandle._get_shardc                 C   s^   t | jdkst| j t| ||\}}| }t |dksJt| t|d | gS )z
        Returns the shape of ``tensor`` after sharding including padding. This
        requires ``tensor`` to have 1D shape and ensures that the returned
        shape is 1D.
        r   r   )r   r   rj   r#   r  r   rH   rI   )r   r   r   Zunpadded_sharded_tensorr   Zunpadded_sharded_sizer1   r1   r2   _get_sharded_size  s      z!FlatParamHandle._get_sharded_size)rz   c                 C   sB   t t| jj}dg|dd  }dd |D }t t||}|S )z
        Returns [start, end] offsets of each original parameter's flattened
        data in the unsharded flat parameter (without padding).
        NOTE: The returned list includes elements for alignment padding.
        r   Nc                 S   s   g | ]}|d  qS )r   r1   )r}   endr1   r1   r2   r     s     z;FlatParamHandle._get_flat_param_offsets.<locals>.<listcomp>)r   r   r   rY   r   )rN   Zcumulative_sumZstartsZendsrG   r1   r1   r2   r     s
    z'FlatParamHandle._get_flat_param_offsetsc           	      C   s   g }g }g }g }t | jj| jj| jj| jjD ]D\}}}}|jsBq.|| || || ||j|j	f q.t
t|t|t||S )z
        Returns shard-related metadata specific to this rank's shard of the
        flat parameter.
        NOTE: The returned tuple does not include elements for alignment
        padding but does account for the padding.
        )r   r   rW   rV   rZ   r[   r@   r   rC   rD   r$   r   )	rN   Z	fqns_listZshapes_listZnumels_listZshard_param_offsetsr   r   r   r  r1   r1   r2   shard_metadata  s4    



zFlatParamHandle.shard_metadatac                 C   sH  | j }|j| jkr<| js |j| _| js4| js4|j| _|j| _td}| j	rft
|j|kd|j  n| | j  |j|_| j	r|j |_tj|j|d |_| jrtj|j| j| jd|_t|j | jrD| jr| jn|j}| | j }tj|| j|d|_|j |_t|j | jrDtj|| j|jd|_t|j dS )a  
        This initializes some attributes on the handle's ``FlatParameter``.
        This should be called during lazy initialization since it requires the
        parameter to be on the compute device if not offloading to CPU and we
        want to give users the chance to move the parameter appropriately after
        the FSDP constructor.

        For each tensor attribute on the ``FlatParameter``, see the unshard and
        reshard methods in this class for the allocation and free pattern.
        cpuzWExpects the `FlatParameter` to be on CPU when parameter CPU offloading is enabled, not r   )r   r   N)r   r   r   r   r   r   r   rH   r   r   r   _check_on_compute_devicern   r_   Z
pin_memory
zeros_likerc   _uses_param_mixed_precision
empty_likerb   r   r   r   r   emptyr`   r   rR   ra   )rN   r   Z
cpu_deviceZunsharded_param_dtypeZpadded_unsharded_numelr1   r1   r2   init_flat_param_attributes@  sf    

 

z*FlatParamHandle.init_flat_param_attributesc                 C   s   | j tjkr| jr|   d}| jr2| js2|  }| jrH| j	sH| 
 sHnB| jrb| jsb|   d}n(| j	r| jj| jkr| j| jdd d}| | j |S )a$  
        Returns: ``False`` if this is a no-op and ``True`` otherwise.

        Postcondition: ``self.flat_param`` 's data is on the device for
        communication and is what should be all-gathered. This means that it
        matches the dtype of the expected unsharded parameter.
        FTZnon_blocking)r   r   SUMMON_FULL_PARAMS_skipped_use_sharded_viewsr   r   r   _writeback_orig_paramsr   r   needs_unshardr  _force_full_precision_use_low_precision_shardr   r   flat_param_tor  )rN   retr1   r1   r2   pre_unshard  s.    	
zFlatParamHandle.pre_unshardc                 C   sF   |    | j}t|j|j  |j|jj| jdd |j|_	dS )z
        Allocates the low precision shard directly on the compute device and
        switches to using the low precision sharded flat parameter.
        Tr  N)
_check_low_precision_shardr   r   rb   r_   r   copy_tor   rn   )rN   r   r1   r1   r2   r    s      z(FlatParamHandle._use_low_precision_shardc                 C   sJ   |   s*| jr|  n| j}| | dS |  }| |}| | dS )a  
        Runs the unshard logic. This includes all-gathering the flat parameter
        and switching to using the unsharded flat parameter. If the handle does
        not need unsharding, then this only switches to using the unsharded
        flat parameter. For ``NO_SHARD``, this is a no-op.

        If FSDP is in :meth:`summon_full_params` and the handle uses parameter
        mixed precision, then the parameter is forced to full precision.
        N)r  r    _get_padded_unsharded_flat_paramr   _use_unsharded_flat_param"_alloc_padded_unsharded_flat_param_all_gather_flat_param)rN   unsharded_flat_parampadded_unsharded_flat_paramr1   r1   r2   unshard  s    



zFlatParamHandle.unshardc                 C   s,   | j s
dS |  }|  | k}| S )z=Returns if the handle's flat parameter needs to be unsharded.F)r   r%  r   r   r   )rN   r)  Zalready_unshardedr1   r1   r2   r    s    
zFlatParamHandle.needs_unshardc                 C   s0   |    | j}|  }| | t||j |S )a   
        Allocates the *padded* unsharded flat parameter. The unpadded unsharded
        flat parameter is always a view into the padded one. This padded
        parameter is saved to a different attribute on the ``FlatParameter``
        depending on if we force full precision.
        )_check_sharded_strategyr   r%  _check_storage_freedr   rR   rN   r   r)  r1   r1   r2   r'    s    
z2FlatParamHandle._alloc_padded_unsharded_flat_paramc                 C   sb   |    | j}| jrX| jrX|j}t|j| jkd| j  |j	 
 dkr^t|j n|j}|S )z
        Returns a reference to the padded unsharded flat parameter depending on
        the calling context. This should only be called if using a sharded
        strategy.
        zExpects full precision but got r   )r,  r   r  r  ra   r   r   r   r`   untyped_storager   r   r.  r1   r1   r2   r%    s    


z0FlatParamHandle._get_padded_unsharded_flat_param)r*  rz   c                 C   s   t t| dot| dd | jj}| | j }t | |kd| d|   |jrtt	|t
| j}t
j||| jd}nt
||| j |S )z
        All-gathers the handle's flat parameter to the destination
        ``padded_unsharded_flat_param``, and switches to using the all-gathered
        tensor.
        r   r   zEExpects a process group and world size to have been set via `shard()`zExpects z numel but got group)r   hasattrr   rn   r   r   Zis_cpur   rH   r  r   Zget_world_sizer   Z
all_gatherall_gather_into_tensor)rN   r*  r   Zexpected_numelZtensor_listZworkr1   r1   r2   r(    s8    	
 
  z&FlatParamHandle._all_gather_flat_paramc                 C   sx   | j j}|d|  || j _| jtjk}| jtjk}| j	rd| j
rN|rNdS | j| o\| d n|rt| jdd dS )z
        Switches to using the *unpadded* unsharded flat parameter, which is a
        view into the *padded* unsharded flat parameter.
        Nr   F)r   rQ   r   viewrn   r   r   FORWARDBACKWARD_PREr   r  r   )rN   r*  unsharded_size
in_forwardZin_pre_backwardr1   r1   r2   r&  B  s     

z)FlatParamHandle._use_unsharded_flat_paramc                 C   s$   | j r| jr|   | | j dS )zo
        Runs the post-unshard logic. This includes freeing the low precision
        shard if needed.
        N)r  r   !_free_low_precision_sharded_paramr  r   rN   r1   r1   r2   post_unshardb  s    zFlatParamHandle.post_unshardc                 C   s,   |    t| jj| j  t| jj dS )z/Frees the low precision sharded flat parameter.N)r"  r   r   rb   r   current_streamr   r:  r1   r1   r2   r9  k  s     z1FlatParamHandle._free_low_precision_sharded_paramc                 C   s"  | j s|   dS | j}| | tjdtj| jd}|jdk|d< t	j
|| jd |d | jkrtd|_|   dS tj|j| jd}|jdkr| jt	jjkrtd| j d d|_tj|j| jd}n| |j |j|_|j}t	||| j | jj}|d|  ||_|   dS )	a  
        Unshards the handle's ``FlatParameter`` 's gradient. If all ranks have
        ``None`` gradient, then all original parameters will as well. This
        method performs an all-reduce and an all-gather. The additional
        all-reduce is tolerable since this method is not meant to be used on
        the computation critical path.

        Postcondition: ``_saved_grad_shard`` is defined and contains the value
        to set ``flat_param.grad`` after gradients are resharded.
        Nr   )r   r   r   r0  r  [Rank z] Only some but not all ranks have a `None` `FlatParameter` gradient, so FSDP is using zeros to approximate those ranks' sharded gradients being `None`)r   _use_unsharded_grad_viewsr   _check_unshardedrH   ZzerosZint32r   gradr   Z
all_reducer   r   rd   r  rR   r   
DebugLevelINFOwarningswarnr   rS   _check_shardedr3  rQ   r   r4  )rN   r   Znum_grad_noneZpadded_unsharded_gradsharded_gradr7  r1   r1   r2   unshard_gradz  sH    

  zFlatParamHandle.unshard_gradc                 C   s4   | j r|   | jsd S | jj| j_t| jd d S )Nrd   )r   _use_sharded_grad_viewsr   r   rd   r@  delattrr:  r1   r1   r2   reshard_grad  s    zFlatParamHandle.reshard_gradc                 C   s(  t | jtjtjfkd | j}|jdk	r$|j |jksJ|jj	|j	kr$| 
| j |jj	| j	k}t | pp| jd| j	 d|jj	  |j |j k}|r|s|jj|_|j}nt t|dd |j}|jj}| jr|j|kr|||_n,|j}t |j |kd| d|j   d|_dS )	z
        Prepares the gradient for the backward computation by saving and
        clearing any existing sharded gradient in ``.grad`` to enable computing
        a new unsharded gradient.
        z:Expects to be in `BACKWARD_PRE` or `IDLE` (if prefetching)Nz&Expects the sharded gradient to be on r   rc   z7`_cpu_grad` should be defined if the gradient is on CPUzFExpects `.grad` to be the unsharded gradient in `no_sync()` with size z but got size )r   r   r   r6  r   r   r@  r   rQ   r   r  r   r_   rn   rd   r2  rc   r   r   r$  rR   )rN   r   Zgrad_offloadedZprev_iter_synced_gradientsrF  Zlocal_shard_dtypeZpadded_unsharded_sizer1   r1   r2   prepare_gradient_for_backward  sV    


z-FlatParamHandle.prepare_gradient_for_backwardc                    s    fdd} j }t|drB |  | |j|_|| nlt|dr |  | |jdk	rv |j |jr|j|_|jdk	r|| nt	 j
 p|j d t|drt|d dS )z
        Prepares the gradient for optimizer computation by moving the sharded
        gradient to the ``.grad`` attribute.
        c                    sN    j sJ jrJt| jd k	d | jj jkrJ| j j| j_ jrJ 	  d S )NzUnexpected None grad!)
r  r   r   r@  r   r   r$  rn   r   rH  )r   r:  r1   r2   "cast_grad_to_param_dtype_if_needed   s    zVFlatParamHandle.prepare_gradient_for_optim.<locals>.cast_grad_to_param_dtype_if_neededrc   rd   NzcAll sharded parameters that received a gradient in the post-backward should use `_saved_grad_shard`)r   r2  rE  _check_on_cpurc   r@  r  rd   r   r   r   rI  )rN   rL  r   r1   r:  r2   prepare_gradient_for_optim  s.    	










z*FlatParamHandle.prepare_gradient_for_optimc                 c   s   |    t| j | jjkd| jj d| j   | | j | j  }|    }t||kd | 	t
d |   z
dV  W 5 t| j | jjkd| jj d| j   |  }|d| j  | j | | X dS )a[  
        Moves the unpadded unsharded flat parameter to CPU while in the context
        and moves it back to the previous device upon exit. For now, this
        assumes the ``FlatParameter`` is the unpadded unsharded flat parameter
        since (1) there is no reason to include the padding in the copy and (2)
        there is no use case for the sharded flat parameter.

        Precondition: ``self.flat_param`` 's data is the unpadded unsharded
        flat parameter on the compute device, and the handle uses a sharded
        strategy.
        Postcondition: Same as the precondition.
        zExpects size r   zEExpects the unpadded parameter to be a view into the padded parameterr  N)r,  r   r   r   rQ   r  r   Z	_data_ptrr%  r  rH   r   _free_unsharded_flat_paramr'  r   r#  r&  )rN   Zunpadded_storage_ptrZpadded_storage_ptrr*  r1   r1   r2   to_cpu)  s4    
zFlatParamHandle.to_cpu)free_unsharded_flat_paramc                 C   s   |    |r|   dS )av  
        Runs the reshard logic. This includes freeing the unsharded flat
        parameter if ``free_unsharded_flat_param`` and switching to using the
        sharded flat parameter. Note that this also implicitly offloads
        the sharded flat parameter (if CPU offload is enabled) by pointing
        it to the ``_local_shard`` attribute which resides on CPU.
        N)_use_sharded_flat_paramrO  )rN   rQ  r1   r1   r2   reshardY  s    zFlatParamHandle.reshardc                 C   s   | j r| js| js|   dS )a;  
        Runs the post-reshard logic. This includes freeing any memory that
        can now be freed given that the ``FlatParameter`` points to the full
        precision sharded flat parameter.

        Precondition: ``self.flat_param`` 's data points to the full precision
        sharded flat parameter.
        N)r  r   r  r9  r:  r1   r1   r2   post_reshardi  s    zFlatParamHandle.post_reshardc                 C   s@   |    |  }| | | | t|| j  t| dS )z
        Frees the padded unsharded flat parameter. The tensor to free depends
        on the calling context since the unshard may have forced full
        precision, in which case a different tensor is used.
        N)r,  r%  _check_storage_allocatedr  r   r   r<  r   )rN   r)  r1   r1   r2   rO  |  s    

 z*FlatParamHandle._free_unsharded_flat_paramc                 C   s   | j }| jr8| jtjk}t o,|o,| jtk}|r8|j	}| j
r`|jj}t|tdkd|  |j|_	| jr|rz|| _n|   |r| js|jdk	o| jo|jj|jk}|r|   n|   dS )z-Switches to using the sharded flat parameter.r  z-Expects the local shard to be on CPU but got N)r   r   r   r   r5  rH   Zis_grad_enabledr   *NO_RESHARD_AFTER_FORWARD_HANDLE_STRATEGIESrn   r   r_   r   r   r   r   r  r@  r   r   rQ   r>  rH  )rN   r   r8  Zskip_use_sharded_viewsr)  r   Zaccumulated_grad_in_no_syncr1   r1   r2   rR    sF    


z'FlatParamHandle._use_sharded_flat_param)r   rz   c                 C   s>   | j }|dkr|}dd ttj||jdd|j|jD }|S )a2  
        Returns unflattened ``Tensor`` views into ``tensor`` if it is not
        ``None`` or ``flat_param`` otherwise, where the unflattening is based
        on ``flat_param`` 's metadata.

        Examples for ``tensor`` include ``flat_param.grad`` or unsharded
        tensor optimizer state.
        Nc                 s   s$   | ]\}}}t |||V  qd S r   )r    r4  )r}   Z	subtensorr   Zparam_extensionr1   r1   r2   	<genexpr>  s   z>FlatParamHandle._get_unflat_views_unaligned.<locals>.<genexpr>r   r   )r   r   rH   splitrZ   rV   rX   )rN   r   r   viewsr1   r1   r2   r     s    z+FlatParamHandle._get_unflat_views_unalignedc                 C   sv   | j }|dkr|}tj||jdd}d}g }t||jD ]8\}}|rFq8|t||j	| |j
|  |d7 }q8|S )z
        This has the same contract as :meth:`_get_unflat_views_unaligned`
        except it checks for ``None`` placeholders representing padding for
        alignment, which may incur slightly more CPU overhead.
        Nr   r   r   )r   rH   rX  rY   r   ri   r   r    r4  rV   rX   )rN   r   r   ZsplitsidxrY  rX  r   r1   r1   r2   r     s*    
  
z)FlatParamHandle._get_unflat_views_aligned)r   rz   c                 C   s  | j }| | |  }ddlm} tt||jD ]\}\}\}}}	| jr|rt	||krr| 
||t| q4| j j| }
| 
|||
 ||
_q4|r| 
||t| q4|}| jr| jtjkr|| j j|< n"| jtjkr| j j| }||_|}| ||| | jr4| jtjkr4||j|< q4t| j jD ]\}\}}}	}}}	t||}t| p^t|tjd| dt	|  | jr|r| j j| }| 
||| ||_nD|r| 
||| n.| ||| | jr,| jtjkr,||j|< q,dS )a  
        Unflattens the unsharded flat parameter by setting the original
        parameter variables to be views into it.

        Args:
            as_params (bool): If ``True``, then registers the original
                parameters as ``nn.Parameter`` s; if ``False``, then registers
                the original parameters only as ``Tensor`` s. ``False`` should
                be used during forward/backward computation and when hiding the
                original parameters from :meth:`nn.Module.named_parameters`.
        r   )DTensorz
as_params=z type(prim_param)=N)r   r?  r   Ztorch.distributed._tensorr[  r  r   rU   r   typer   r:   rk   re   rn   r   r   r5  rg   r6  r   _parametersr\   rM   r   rL   rf   )rN   r   r   rY  r[  r  r4  r3   r4   r   r   Z	param_varr   r<   r=   
prim_paramZshared_paramr1   r1   r2   r     st    





 
z$FlatParamHandle._use_unsharded_viewsc                 C   s  | j jdkr.t| j j| j jD ]
}d|_qdS | | j j | | j j}tt|| j j	D ]\}\}\}}}t
t||| j j|  d t||}|j|jks|j|jks|j|jkr|jdkrt||_||j_q\||_q\t| j jD ]\}\}}}}	}
}t
t|||r|d | n| d t||}t|
|	}|j|jjksn|j|jjksn|j|jjkr|jdkrt||_|j|j_q|j|_qdS )z
        Unflattens the unsharded flat parameter's gradient by setting the
        original parameter variables' gradients to be views into it.
        Nz is missingr   )r   r@  r   re   rf   r?  r   r  r   rU   r   r2  rW   rM   r   r   r   rH   r  rn   r\   )rN   r   rY  r  r4  r3   r4   r   r5   r<   r=   r^  r1   r1   r2   r>  I  s`    









z)FlatParamHandle._use_unsharded_grad_viewsc              	   c   s*   | j dd z
dV  W 5 | j dd X dS )a"  
        Assumes the flat parameter is unsharded. When in the context,
        unflattens the original parameters as ``nn.Parameter`` views into the
        flat parameter, and after the context, restores the original parameters
        as ``Tensor`` views into the flat parameter.
        Tr   FN)r   r:  r1   r1   r2   unflatten_as_params  s    
z#FlatParamHandle.unflatten_as_paramsc                 C   s8  d| _ | js| jdd dS | j}| | tjd| jj| jjdd}t	|j
|j|jD ]J\}}\}}}| ||| |js||_qX|j}|j}	||||	  |_qX| jjdk	sttt	| jj| jjD ]6\}
\}\}}}}}}| ||| t||}||_q| jtjkr4tt| jjD ]}
d| jj|
< q dS )a5  
        Sets the original parameter variables' data to be flattened views into
        the sharded flat parameter.

        The views are kept as flattened to simplify the case where a parameter
        is sharded across ranks. Parameters whose data is not present in the
        sharded flat parameter have their data set to a size-0 empty tensor. We
        do not delete them to ensure to preserve expected behaviors like model
        printability. Parameters whose data is present must preserve their
        variables to be passable to an optimizer.
        NTr   r   F)r   r   ro   )r   r   r   r   rE  rH   r  r   r   r   re   r[   rU   r   r@   rn   rA   rB   rf   rj   r  r\   rM   r   r   BACKWARD_POSTr   r   rg   )rN   r   Zsize_0_empty_tensorr   r  r3   r4   r   offsetrB   r  r<   r=   r^  r1   r1   r2   r     sH    
  

z"FlatParamHandle._use_sharded_viewsc                 C   sb  | j }| | | j}|dkr<t|j|jD ]
}d|_q,dS | | t|j|j|j	D ]\}}}|j
spd|_qX|j}|jr|s|j}| js|j|jkr|jdkrt||_||||  |j|j_q||||  |j|_qXd|_qX|jdk	sttt|j|jD ]L\}\}\}	}	}	}
}}	t||
}|rT|jrTt||
}|j|_nd|_qdS )a%  
        Sets the original parameter variables' gradients to be flattened
        views into the sharded flat parameter's gradient. This is a no-op if
        there is no gradient.

        Parameters whose data is not present in the sharded flat parameter and
        parameters with ``requires_grad=False`` have their gradients set to
        ``None``. Since the gradient variables do not need to be preserved,
        this method does not manipulate existing ``Tensor`` data directly and
        creates new ``Tensor`` variables instead.
        N)r   rE  rF  r   re   rf   r@  r   r[   rh   r@   rB   ro   rA   r   r   rH   r  Zreshaper   rn   rj   r  r\   r2  rM   )rN   r   r@  r   r  Zis_grad_nonerB   ra  r  r   r<   r=   r  r^  r1   r1   r2   rH    sP    



 



z'FlatParamHandle._use_sharded_grad_viewsc              	   C   s  | j r| | js| jsdS | j}d}| jrP| j rP| j  }t|dkd n|  }| j sh| jsn|j	n|j
}|dkrdn
|  }tt|j|j|jD ]\}\}\}}	}
}}\}}}|sqt||sq| jr|j| }t|dk	d|j|   t|||k	}|pt|| }| jrB|s2|rBtd| j |r\t||}||j|< |rt|
g}| |||||	d d}| jrq|j	dkr|j	dk	rt|
g}| d|j	|||	d q|j	dk	r| j s| jrq|dkpt|j	| }|r|dkrt|}t|
g}| |j	||||	d ||_	|j	}|  }qt|jD ]4\}\}}}}}}t||t||k	r^tdq^|S )	a  
        Iterates over the original parameters and writes back any parameters
        that changed storages (due to a non-inplace operator) to the handle's
        ``FlatParameter``. This method preserves the ``FlatParameter` 's
        device even if an original parameter's device changes.

        Raises:
            RuntimeError: If an original parameter or gradient changes storages
            but no longer has the expected flattened shape.
        Returns: ``True`` if some writeback happened, and ``False`` otherwise.
        Fr   zPIf skipped using sharded views, the unsharded flat parameter should be allocatedNz!Expects to have saved tensor for zOFSDP does not support changing the parameters between forward and backward for Tz/Changing shared parameters is not supported yet)r   
is_shardedr   r  r   r/  Zdata_ptrr   r   r@  rc   r  r   re   r[   rU   r2  rg   rW   rM   r   rj   r   rH   rI   _writeback_tensorr  r\   NotImplementedError)rN   r   Z	wrotebackZflat_param_data_ptrZflat_param_gradZflat_param_grad_data_ptrr  r   r@   rA   rB   r   r3   r4   Zparam_changedZneeds_param_writebackexpected_shapeZneeds_grad_writebackr<   r=   r1   r1   r2   r    s    

	

 


          

 



z&FlatParamHandle._writeback_orig_params)
src_tensor
dst_tensortensor_indexre  ra  is_paramrz   c           
      C   s6  t t|dkd|  | jtjjkrt| dr6| jnt }|dk	rL|j	nd}|dk	r^|j
nd}	td| d|rvdnd d	| j d
| d| d|j
 d|	  |dk	r|j	|krtd|rdnd d| d|j	 |dk	r||||   | n6||||     | jjdk	s&td| jj|< dS )a#  
        Writes back ``src_tensor`` to ``dst_tensor`` at offset ``offset``,
        where ``src_tensor`` should have shape ``expected_shape``. ``is_param``
        indicates if the tensor is the parameter (if ``True``) or gradient (if
        ``False``). If ``src_tensor`` is ``None``, then the effect is zeroing
        instead of copying. ``tensor_index`` gives the index of ``src_tensor``
        in the metadata structures.

        Raises:
            RuntimeError: If the ``src_tensor`` does not have the expected
            shape.
        r   z$Expects a 1D expected shape but got r   Nr=  z] rk   ZGradientz needs writeback in z
expected shape=z shape=z expected device=z device=zCannot writeback when the Z	parameterZgradientz shape changes
Expects r   T)r   r   r   r   rA  rB  r2  r   Zget_rankr   r   rC  rD  r   RuntimeErrorr   r#  Zzero_r   rh   rj   )
rN   rf  rg  rh  re  ra  ri  r   Z	src_shapeZ
src_devicer1   r1   r2   rc    s(    
8z!FlatParamHandle._writeback_tensorc                 C   s^   | j s
dS | j}|jdk	std}d}|jD ]}||jdkM }||jO }q,|rTd|_||_dS )a  
        When ``use_orig_params=True``:
        (1) sets the underlying ``flat_param.grad`` to ``None`` if *all* of the
        original parameters' ``.grad`` are ``None``, and
        (2) sets ``flat_param.requires_grad=False`` if *none* of the original
        parameters require gradient.
        For (1), this is targeting ``optim.zero_grad(set_to_none=True)``, in
        which case we want to free the gradients as soon after the
        ``zero_grad()`` call as possible.
        NTF)r   r   re   rj   r@  ro   )rN   r   Zall_grad_nonero   r   r1   r1   r2   %_reset_flat_param_grad_info_if_needed  s    
z5FlatParamHandle._reset_flat_param_grad_info_if_neededc                 C   s^   | j jD ]"}|\}}}t||rt|| q| j jD ]$\}}}}}}t||r4t|| q4d S r   )r   rU   r2  rI  r\   )rN   
    def _deregister_orig_params(self):
        for param_info in self.flat_param._param_infos:
            param_name, module, _ = param_info
            if hasattr(module, param_name):
                delattr(module, param_name)
        for param_name, module, _, _, _, _ in self.flat_param._shared_param_infos:
            if hasattr(module, param_name):
                delattr(module, param_name)

    def flat_param_to(self, *args, **kwargs):
        """Wraps an in-place call to ``.to()`` for ``self.flat_param``."""
        self.flat_param.data = self.flat_param.data.to(*args, **kwargs)
        if self._use_orig_params:
            # Refresh the original-parameter views since the underlying
            # storage may have changed
            if self.is_sharded(self.flat_param):
                self._use_sharded_views()
            else:
                self._use_unsharded_views(as_params=True)
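    # NOTE: Illustrative-only usage of `flat_param_to` (the `handle` name is
    # hypothetical); the call is in place, and with `use_orig_params=True` the
    # original-parameter views are refreshed to alias the moved storage:
    #
    #   handle.flat_param_to(torch.device("cpu"))
    #   handle.flat_param_to(dtype=torch.float32)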
zFlatParamHandle.flat_param_toc                 C   s&   dd | j jD dd | j jD S )z~
        Returns a :class:`set` of the modules whose parameters are included
        in this handle's flat parameter.
        c                 S   s   h | ]
}|j qS r1   r{   r|   r1   r1   r2   r   	  s     z/FlatParamHandle._get_modules.<locals>.<setcomp>c                 S   s   h | ]
    def is_sharded(self, tensor: Tensor) -> bool:
        """
        Returns if ``tensor`` is *currently* sharded. For ``NO_SHARD``, we
        choose to have this always return ``False`` for clarity.
        """
        if (
            not hasattr(self.flat_param, "_sharded_size")
            or not self.uses_sharded_strategy
        ):
            # `_sharded_size` is defined iff the handle has sharded the flat parameter
            return False
        sharded_size = self.flat_param._sharded_size
        return tensor.size() == sharded_size
zFlatParamHandle.is_shardedc                 c   s>   dd | j jD }t| j j|D ]}|\}}}||fV  q d S )Nc                 S   s$   g | ]\}}}}}}t |||qS r1   r%   r}   r3   r4   r5   r   r1   r1   r2   r   )	  s   z6FlatParamHandle.param_module_names.<locals>.<listcomp>)r   r\   r   rU   )rN   ru   rl  r3   r   r5   r1   r1   r2   param_module_names(	  s    	
z"FlatParamHandle.param_module_namesc                 c   s,   dd | j jD D ]\}}}||fV  qd S )Nc                 S   s$   g | ]\}}}}}}t |||qS r1   rr  rs  r1   r1   r2   r   9	  s   z=FlatParamHandle.shared_param_module_names.<locals>.<listcomp>)r   r\   )rN   r3   r   r5   r1   r1   r2   shared_param_module_names8	  s    	z)FlatParamHandle.shared_param_module_namesc                 C   s4   g }t | jj| jjD ]\}}|jr|| q|S )z@Returns the FQNs of the parameters present in this rank's shard.)r   r   rW   r[   r@   r   )rN   Zfqns_in_shardr   r  r1   r1   r2   _fqns_in_shardF	  s     zFlatParamHandle._fqns_in_shardc                 C   s^   | j }t|dr|j}nBt|dr*|j}n0t|jdkpN| j pN| jtj	tj
fkd |j}|S )z&Returns the handle's sharded gradient.rc   rd   NzZSharded strategies should use `_cpu_grad` or `_saved_grad_shard` unless in IDLE or FORWARD)r   r2  rc   rd   r   r@  r   r   r   r5  r   )rN   r   r@  r1   r1   r2   rF  Q	  s     



zFlatParamHandle.sharded_gradc                 C   sf   | j s
dS t| jtjkd | j}|jdk	s0tt|jD ]&\}}|j	r:|j
dk	sVtd|j
|< q:dS )a*  
        Resets ``_is_grad_none_mask`` as needed. This method should only be
        called in the post-backward after gradient computation, in which case
        if a parameter requires gradient, then it will surely receive a
        gradient and we may reset its mask entry to ``False``.
        NzIExpects to only be called in the post-backward after gradient computationF)r   r   r   r   r`  r   re   rj   r  ro   rh   )rN   r   r  r   r1   r1   r2   _reset_is_grad_nonep	  s    
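    # NOTE: Illustrative-only sketch of the post-backward gradient bookkeeping
    # (the `handle` name is hypothetical):
    #
    #   loss.backward()
    #   handle._reset_is_grad_none()   # params that require grad -> mask entry False
    #   grad = handle.sharded_grad     # `_cpu_grad` / `_saved_grad_shard` / `.grad`
    #   # A later writeback of a `None` gradient zeroes its slice and flips the
    #   # mask entry back to True (see `_writeback_tensor`).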
    def _check_sharded_strategy(self):
        _p_assert(self.uses_sharded_strategy, "Expects sharded strategy")

    def _check_on_compute_device(self, tensor: Tensor):
        _p_assert(
            tensor.device == self.device,
            f"Expects tensor to be on the compute device {self.device}",
        )

    def _check_on_cpu(self, tensor: Tensor):
        _p_assert(
            tensor.device == torch.device("cpu"),
            f"Expects tensor to be on CPU but got {tensor.device}",
        )

    def _check_storage_freed(self, tensor: Tensor):
        storage_size = tensor._typed_storage()._size()
        _p_assert(
            storage_size == 0,
            f"Expects storage to be freed but got storage with size {storage_size}",
        )

    def _check_storage_allocated(self, tensor: Tensor):
        storage_size = tensor._typed_storage()._size()
        _p_assert(storage_size > 0, "Expects storage to be allocated")

    def _check_low_precision_shard(self):
        _p_assert(
            self._uses_param_mixed_precision,
            "Not using low precision for parameters",
        )
        _p_assert(
            getattr(self.flat_param, "_mp_shard", None) is not None,
            "Expects `_mp_shard` to exist",
        )
        device = self.flat_param._mp_shard.device
        _p_assert(
            device == self.device,
            f"Expects the low precision shard to be on {self.device} but got {device}",
        )

    def _check_unsharded(self, tensor: Tensor):
        msg_prefix = "Expects tensor to be unsharded "
        _p_assert(tensor is not None, msg_prefix + "but got `None`")
        unsharded_size = self.flat_param._unpadded_unsharded_size
        _p_assert(
            tensor.size() == unsharded_size,
            msg_prefix + f"with size {unsharded_size} but got {tensor.size()}",
        )

    def _check_sharded(self, tensor: Tensor):
        msg_prefix = "Expects tensor to be sharded "
        _p_assert(tensor is not None, msg_prefix + "but got `None`")
        sharded_size = self.flat_param._sharded_size
        _p_assert(
            tensor.size() == sharded_size,
            msg_prefix + f"with size {sharded_size} but got {tensor.size()}",
        )
zFlatParamHandle._check_shardedc                 C   s   | j tjkS r   )r   r'   r.   r:  r1   r1   r2   r   	  s    z%FlatParamHandle.uses_sharded_strategyc                 C   s   | j | jkS r   )r   r   r:  r1   r1   r2   r  	  s    z+FlatParamHandle._uses_param_mixed_precisionc                 C   s   | j | jkS r   )r   r   r:  r1   r1   r2   _uses_reduce_mixed_precision	  s    z,FlatParamHandle._uses_reduce_mixed_precisionc                 C   s(   | j s| jo&| jtjkp&| jj o&| jS r   )r  r~  r   r   r  r   Ztrainingr   r:  r1   r1   r2   r  	  s
    
z%FlatParamHandle._force_full_precisionc                 C   s
   | j dk	S )a>  
        This property is used for sharding strategies that do not free after
        forward with ``use_orig_params=True``. This returns if this handle is
        currently in a state where it has skipped using sharded views, in which
        case it can restore view invariants via ``_use_sharded_views()``.
        N)r   r:  r1   r1   r2   r  	  s    z*FlatParamHandle._skipped_use_sharded_views)N)N)hr)   r*   r+   r7   r   r   r:   rk   r   r;   rH   r   r'   rE   r   r   r   ZProcessGroupr   r   r   r   rF   r   r   r   r   r"   r   r   Zno_gradr   r   r?   r   staticmethodr  r   rI   r  r   r   r$   r  r  r!  r  r+  r  r'  r%  r(  r&  r;  r9  rG  rJ  rK  rN  
def _unsafe_setattr_param(
    module: nn.Module, param_name: str, param: nn.Parameter
) -> None:
    module._parameters[param_name] = param
    # NOTE: This bypasses the `nn.Module.__setattr__` checks
    super(nn.Module, module).__setattr__(param_name, param)


def _unsafe_setattr_tensor(module: nn.Module, param_name: str, tensor: Tensor) -> None:
    module._parameters.pop(param_name, None)
    # NOTE: This bypasses the `nn.Module.__setattr__` checks
    super(nn.Module, module).__setattr__(param_name, tensor)


def _safe_setattr_tensor_or_param(
    module: nn.Module, param_name: str, tensor_or_param: Union[Tensor, nn.Parameter]
):
    # Call `delattr()` and `setattr()` to go through the `nn.Module` checks
    if hasattr(module, param_name):
        delattr(module, param_name)
    setattr(module, param_name, tensor_or_param)


def _convert_to_params(
    tensors: List[Union[torch.Tensor, nn.Parameter]]
) -> List[nn.Parameter]:
    return [t if isinstance(t, nn.Parameter) else nn.Parameter(t) for t in tensors]


def _detach_if_needed(param_or_tensor: Union[nn.Parameter, Tensor]) -> Tensor:
    return (
        param_or_tensor.detach()
        if isinstance(param_or_tensor, nn.Parameter)
        else param_or_tensor
    )


def _get_aligned_numel(unsharded_dtype: torch.dtype):
    ALIGNMENT = 16  # bytes
    unsharded_dtype_size = _get_dtype_size(unsharded_dtype)
    aligned_numel = ALIGNMENT // unsharded_dtype_size
    return aligned_numel


@functools.lru_cache(8)
def _get_dtype_size(dtype):
    return torch.empty((), dtype=dtype).element_size()


def _construct_padding_tensor(
    padding_numel: int, dtype: torch.dtype, requires_grad: bool, device: torch.device
):
    # Use the magic padding value to aid debugging of the padded flat parameter
    return (
        torch.ones(
            (padding_numel,), dtype=dtype, requires_grad=requires_grad, device=device
        )
        * _FLAT_PARAM_PADDING_VALUE
    )


# Use `lru_cache(1)` so the warning is only logged once for a fixed message
@functools.lru_cache(1)
def _warn_skip_writeback_check(log: logging.Logger, warning: str):
    log.warning(warning)
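# NOTE: Illustrative-only sketch (not part of the FSDP API) of how the
# alignment helpers above compose: `_get_aligned_numel` gives the number of
# elements in one 16-byte-aligned chunk for a dtype, and any shortfall when
# flattening a parameter can be filled with `_construct_padding_tensor`. The
# function name and the numel value below are hypothetical.
def _example_alignment_padding_sketch(dtype: torch.dtype = torch.float32) -> Tensor:
    aligned_numel = _get_aligned_numel(dtype)  # 4 for float32 with 16-byte alignment
    numel = 7  # hypothetical parameter numel
    padding_numel = (aligned_numel - numel % aligned_numel) % aligned_numel
    return _construct_padding_tensor(
        padding_numel, dtype, requires_grad=False, device=torch.device("cpu")
    )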