U
    *-eR                     @   s  U d dl mZmZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlZd dlmZ d dlm   m!Z" d d	lm#Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 G dd deZ8G dd de1Z9ej#j:ej#j:dddZ;e, a<ej=ee/ dddZ>eej?j@eeAef eeAejBf dddZCejDjEZEedd  ZFd<d"d#ZGd$d% ZHd&d' ZId=d(d)ZJd>d*d+ZKd!d!d!d!d!d,d,ddd-	d.d/ZLeEjMjeGeEjMjNe
eJeEjOjNeEjPjNe
eKeEjQjNeEjRjNe
eKeEjSjNeEjTje
eIeEjUjeEjVjNe
eJeEjWjNeEjTjNe
eJeEjUjNeEjXjYe
eHeEjZjYeEj[jYe
eHeEj\jYeEj]jYe
eHeEj^jYeEj_jNe
eJeEj`jNeEjajYeLeEj%jYe%iZbejDjcjdjYejDjcjejYhZfeejgjh eid0< e#j:e#j:dd1d2ZjeG d3d4 d4Zkeeee8  e-eeekd5d6d7Zld8Zmd?eee8  eee#j:ge#j:f  ee- d9d:d;ZndS )@    )ABCabstractmethod)contextmanagernullcontext)copy)	dataclass)partialwraps)	AnyCallablecastDictListOptionalSetTupleUnion)make_fxN)fx)native_layer_norm_backward)FakeTensorMode)gradients_tagging)DataParallelDTensorExpandModeParallelMode)	Placement)_PyTreeCodeGen_PyTreeInfoCodeGen)	stateless)NamedMemberAccessorc                   @   sN   e Zd ZdZeeejjejjdddZ	ee
jeej e
jdddZdS )	Overridea  
    Override the tracing and transformation behavior of :meth:`~torch.distributed._spmd.compile`.
    This is useful when any part of the model is not traceable or if you prefer
    to not trace it due to any reason. More specifically, users can implement
    :meth:`torch.distributed._spmd.Override.replacement` to replace an original
    submodule with the return new submodule. The new submodule contains
    operations that users preferred to be traced, which simply be a dummy
    placeholder operator. After tracing, users can implement
    :meth:`torch.distributed._spmd.Override.transform` to transform the traced
    graph, where the dummy placeholder operator serves as an anchor to insert
    new sub-graphs.
    )fqnorig_submodulereturnc                 C   s   dS )a  
        Implement this method to return a new :class:`nn.Module` instance to
        replace the ``orig_submodule`` argument in the model. This helps if
        ``orig_submodule`` is not traceable or should not be traced.

        Args:
            fqn (str): fully quantified name of the submodule.
            orig_submodule (class:`nn.Module`): original submodule instance to replace.

        Returns:
            A new :class:`nn.Module` instance to replace the original one.
        N )selfr"   r#   r%   r%   \/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/torch/distributed/_spmd/api.pyreplacement0   s    zOverride.replacement)gm
flat_stater$   c                 C   s   dS )a.  
        Given a DTensor-expanded graph and sharding schema for every node,
        conduct additional transformation for the sub-graph from the :class:`nn.Module`
        returned by :meth:`torch.distributed._spmd.Override.replacement` if
        necessary.

        Args:
            gm (:class:`fx.Graph`): a DTensor-expanded graph.
            flat_state (List[str, :class:`Tensor`]): a reference to the list of
                flattened state. The elements in ``flat_state`` map to the first
                ``len(flat_state)`` placeholders in the graph. The transformation
                can add state to or remove state from ``flat_state`` as long as
                it keeps ``flat_state`` and the placeholders consistent.

        Returns:
            The :class:`fx.Graph` after transformation.
        Nr%   )r&   r)   r*   r%   r%   r'   	transform@   s    zOverride.transformN)__name__
__module____qualname____doc__r   strtorchnnModuler(   r   GraphModuler   Tensorr+   r%   r%   r%   r'   r!   "   s   r!   c                   @   s$   e Zd ZeedddZdd ZdS )_PyTreeCodeGenOutputsOnly)argsr$   c                 G   s   |S Nr%   )r&   r7   r%   r%   r'   process_inputs\   s    z(_PyTreeCodeGenOutputsOnly.process_inputsc                 C   s   t | ||S r8   )r   
gen_fn_def)r&   Z	free_varsZmaybe_return_annotationr%   r%   r'   r:   `   s    z$_PyTreeCodeGenOutputsOnly.gen_fn_defN)r,   r-   r.   r
   r9   r:   r%   r%   r%   r'   r6   Z   s   r6   )r)   r$   c                 C   s,   t tdd| jjjjdd| j_|   | S )zMove the responsibility of flattening the input arguments from the
    graph module to the caller.

    Example:

        output = gm(my_struct)

        gm = gm(to_caller_flattened_graph_module)

        output = gm(*pytree.flatten(my_struct)[0])
    N)	orig_argsZin_specout_spec)pytree_info)r6   r   Z_graphZ_codegenr=   r<   	recompile)r)   r%   r%   r'   !_to_caller_flattened_graph_moduled   s    


r?   tZ
placementsc                 C   s   |t jt| < d S r8   )dtensor_expand_modeZ_placements_overrideidr@   r%   r%   r'   _override_placements   s    rD   )optnamed_statesparamsc              	   c   sp   | d k	st t| j}|D ]}|| | j|| < q| jd }|d }| |d< z
d V  W 5 ||d< || _X d S )Nr   rG   )AssertionErrorr   stateZparam_groupsvalues)rE   rF   rG   Zorig_statesnZparam_groupZorig_paramsr%   r%   r'   _rematerialize_optimizer   s    


rL   c                  c   s:   dd } t jjj}| jt jj_z
d V  W 5 |t jj_X d S )Nc                   S   s   dS )NTr%   r%   r%   r%   r'   f_true   s    z_enable_compile.<locals>.f_true)r1   _utilsZis_compiling__code__)rM   Zorig_is_compiling_coder%   r%   r'   _enable_compile   s    

rP      c                 C   s4   t jj| ||d}t| |D ]\}}|| qd S )N)alpha)aten_foreach_addr   zipcopy_)r&   otherrR   self_updatedss_ur%   r%   r'   _foreach_add_decomp   s    r[   c                 C   s*   | |}t ||D ]\}}|| qd S r8   rU   rV   )opr&   rX   rY   rZ   r%   r%   r'   _foreach_unaop_decomp   s    r^   c                 C   s,   | ||}t ||D ]\}}|| qd S r8   r\   )r]   r&   rW   rX   rY   rZ   r%   r%   r'   _foreach_binop_list_decomp   s    
r_   c                 C   s,   | ||}t ||D ]\}}|| qd S r8   r\   )r]   r&   scalarrX   rY   rZ   r%   r%   r'   _foreach_binop_scalar_decomp   s    
ra   c                 C   s0   | ||||}t ||D ]\}}|| qd S r8   r\   )r]   r&   Ztensor1Ztensor2r`   rX   rY   rZ   r%   r%   r'   _foreach_addcop_scalar_decomp   s    rb   T	lrbeta1beta2weight_decayepsamsgradmaximize
grad_scale	found_infc       	         C   s   | ||||f}t jj| |||||||||	|
||||d}tt||D ]4\}\}}|dkr\qFt||D ]\}}|| qfqFd S )Nrc   rQ   )rS   Z_fused_adamdefault	enumeraterU   rV   )r&   ZgradsZexp_avgsZexp_avg_sqsZmax_exp_avg_sqsZstate_stepsrd   re   rf   rg   rh   ri   rj   rk   rl   Z
orig_tupleZupdated_tupleidxorigupdatedour%   r%   r'   _fused_adam_decomp   s.    rt   DEDUP_TARGETSc                 C   sx   i }| j jD ]^}t|j\}}|jtkr|jf|}||d }|d krT|||< q|| | j 	| q| 
  | S r8   )graphZnodespytreetree_flattenr7   targetru   getZreplace_all_uses_withZ
erase_noder>   )r)   Zargs_to_nodenoder7   _Zargs_keyZunique_noder%   r%   r'   _dedup_collectives,  s    


r}   c                   @   s@   e Zd ZU ejed< ejed< ee	j
j ed< ee	j ed< dS )_CompiledResultr)   modrE   r*   N)r,   r-   r.   r   r4   __annotations__r2   r3   r   r1   optim	Optimizerr   r5   r%   r%   r%   r'   r~   C  s   


r~   )funcmodule_overrideparallel_moder7   kwargsr$   c              
      s<  d\t t|t|  d D ]F}t|tjrJd ksFtd|t|tj	j
r&d kshtd|q&d k	s~tdrt ttjjd d fddd	 tjd
d}tjd
d}i }d k	r
| D ] \}	}
|
jkrj|
 ||	< qt|tfdd}r.dnd}rt tt|tjtjdfdd}t tj||}t tj||}t B tjjd
d( tt|| |td
d|||||}W 5 Q R X W 5 Q R X ||}||||||}t ||g\}}t|}t|}r.D ]}| ||}qt!||S )N)NNr   z%Only support single nn.Module for nowz%Only support single Optimizer for nowz5Couldn't find nn.Module instances from the arguments.)
fqn_prefixmoduler$   c                    sz   D ]p}|  D ]b\}}t|dkr&q| dkr:| d | n|}|||}t|t|krf|| q || qqd S )Nr    .)Znamed_childrenlenr(   rC   Zswap_submodule)r   r   overridenamechildr"   	new_child)accessorr   swapr%   r'   r   e  s    z_compile.<locals>.swapr   F)Zremove_duplicatec                    s   t ||~ r"t||nt \  r6t|nt  | ||}W 5 Q R X |t t| fW  5 Q R  W  5 Q R  S Q R X W 5 Q R X d S r8   )r   Z_reparametrize_modulerL   r   r   list
parametersrJ   )r   rG   buffersrF   r7   r   ret)is_data_parallel_moder   rE   r%   r'   stateless_func  s       z _compile.<locals>.stateless_funcZfakeZsymbolic)argr$   c                    s6    | }dg| j }| j  t 9  < ||S )NrQ   )Zfrom_tensorndimZinput_batch_dimdistZget_world_sizerepeat)r   Zfake_argZarg_dims)data_parallel_mode	fake_moder%   r'   _get_full_batch_arg  s    
z%_compile.<locals>._get_full_batch_arg)Z	check_nan)tracing_modeZdecomposition_tableZ_allow_non_fake_inputs)"rw   rx   r   rJ   
isinstancer2   r3   rH   r1   r   r   r    r0   dictZnamed_parametersZnamed_buffersitemsrI   r   r   r   r5   Ztree_map_onlyrP   ZautogradZdetect_anomalyr   r   SPMD_DECOMP_TABLE	partitionr?   r}   r+   r~   )r   r   r   r7   r   r   rG   r   rF   rK   pr   r   r   r)   Zparams_and_buffersr*   r|   r   r%   )r   r   r   r   r   r   rE   r   r'   _compileK  s    "




    r   Z_compiled_obj)r   gm_transformationr   c                    s   t d fdd}|S )a  
    Compile and optimize a callable, which can be a train step within a training
    loop. This method will extract :class:`nn.Module` and :class:`torch.optim.Optimizer`
    instances from the input arguments and trace operations applied to their
    parameters and states.

    Args:
        module_override (Optional[List[Override]]): a list of Override instances
            that will be applied to the module in order. The :class:`Override`
            objects provide :class:`nn.Module` replacements during tracing and a
            graph transformation function after tracing. (Default: ``None``)
        gm_transformation (Optional[Callable[fx.GraphModule, fx.GraphModule]]):
            a callback that will be called after the original callable is
            compiled and distributed (usually after the first iteration) to
            transform the compiled GraphModule into a new optimized one.
        parallel_mode (Optional[ParallelMode]): a :class:`ParallelMode` object
            that specifies how to parallelize the callable. Each ParallelMode
            would have its own strategy to partition the model and the captured
            graph (Default: ``None``)
    r   c                    s    t   fddS )Nc            	         s   |r| ddnd}d}jtd }|d kr`d}d kr>tn}t |f| |}|jt< |jt| |gd  }t	
  |rr|j|_|s|j| d }n\z|j|d|id }W nB tk
r } z"dt|kr||j| d }W 5 d }~X Y nX |W  5 Q R  S Q R X d S )Nlast_train_stepFTr   Z	last_iter)pop__dict__rz   COMPILED_OBJECT_KEYrB   r   r*   rw   rx   r1   Zno_gradr)   	TypeErrorr0   )	r7   r   r   Z
first_iterZcompiled_objmodeZ	flat_inpsoutpute)r   r   r   r   wrapperr%   r'   r     s0    

 z'compile.<locals>.inner.<locals>.wrapper)r	   r   r   r   r   )r   r   r'   inner  s    *zcompile.<locals>.inner)r   )r   r   r   r   r%   r   r'   compile  s    .r   )rQ   )rQ   )rQ   )NNN)oabcr   r   
contextlibr   r   r   dataclassesr   	functoolsr   r	   typingr
   r   r   r   r   r   r   r   r   Z	functorchr   r1   Ztorch.distributeddistributedr   Z)torch.distributed._functional_collectivesZtorch.nnr2   Ztorch.utils._pytreeutilsZ_pytreerw   r   Ztorch._decomp.decompositionsr   Ztorch._subclasses.fake_tensorr   Z%torch.distributed._spmd.data_parallelr   Z%torch.distributed._spmd.parallel_moder   r   r   Ztorch.distributed._tensorr   Ztorch.fx.graphr   r   r   Ztorch.nn.utilsr   Z%torch.nn.utils._named_member_accessorr    r!   r6   r4   r?   rB   r5   rD   r   r   r0   	ParameterrL   ZopsrS   rP   r[   r^   r_   ra   rb   rt   Z_foreach_add_ZScalarrT   Z_foreach_addcdiv_Z_foreach_addcdivZ_foreach_addcmul_Z_foreach_addcmulZ_foreach_div_Z_foreach_divZ_foreach_mul_Z_foreach_mulZ_foreach_neg_rm   Z_foreach_negZ_foreach_reciprocal_Z_foreach_reciprocalZ_foreach_sqrt_Z_foreach_sqrtZ_foreach_sub_Z_foreach_subZ_fused_adam_r   Zc10d_functionalZ
all_reduceZwait_tensorru   Z_opsZ
OpOverloadr   r}   r~   r   r   r   r%   r%   r%   r'   <module>   s   ,8





.                       &


 %   
