U
    9%e                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZmZ d dlZ d dl!Z d dl"m#  m$Z% d d	l&m'Z' d d
l(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z: ddl6m;Z;m<Z< ddl#m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJmKZK eLeMZNejejOddZOe jJjPZPdd ZQdd ZRdd ZSdd ZTdd ZUd d d!dgZVd"d# ZWeeX d$d%d&ZYdd(d)ZZG d*d+ d+Z[G d,d- d-e[Z\G d.d/ d/e[Z]G d0d1 d1e[Z^G d2d3 d3e[Z_G d4d5 d5e[Z`d6e_iZad7d8 Zbd9d: Zcd;d< Zdd=d> ZeejfG d?d@ d@ZgejfG dAdB dBegZhdCdD ZiG dEdF dFehZjejfG dGdH dHejZkG dIdJ dJeZlG dKdL dLeZmeRdMeRdNeRdOeRdPeRdQeRdRdSZndTdU ZoejfG dVdW dWehZpdXdY ZqG dZd[ d[epZrd\d] Zsd^d_ ZtddadbZuejeud'dcZvddde ZwejfG dfdg dgegZxejfG dhdi diexZyejfG djdk dkexZzG dldm dmexZ{ejfG dndo doexZ|ejfG dpdq dqe|Z}ejfG drds dsexZ~G dtdu due}ZG dvdw dwegZejfG dxdy dyeZejfG dzd{ d{eZejfG d|d} d}egZG d~d deZG dd deZG dd deZG dd deZejfG dd degZG dd deZG dd deZG dd degZG dd degZejfG dd deZG dd deZejfG dd deZG dd deZG dd deZejfG dd deZejfG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd degZejfG dd deZejfG dd degZG dd deZddddeeX eeX eeX eXeeeX d	ddZddddddZG dd deZG dd deZG dd deZG dd deZG dd deZG ddĄ deZG ddƄ deZG ddȄ deZG ddʄ deZG dd̄ deZG dd΄ deZejfG ddЄ degZG dd deZG ddӄ deZG ddՄ de jjZG ddׄ d׃ZG ddل dكZG ddۄ deZG dd݄ deZG dd߄ deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    N)nullcontext)Enum)partial)	signature)
AnyCallableClassVarDictListOptionalSequenceSetTupleUnion)patch)ExprInteger)identity)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_formake_contiguous_strides_for)get_signature_for_torch_op)CleanDivFloorDivModularIndexing   )configdependencies)index_prevent_reordering)get_device_properties)extract_read_writesvar_builder)argsortcache_on_selfconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningget_kernel_metadatapad_listlike	sympy_dotsympy_product
sympy_subssympy_symboltry_find_schema)opsVz  prefixc                    s    fdd  |  d S )Nc              	      sf   t | ttfr"| D ]} | qn@t | tjjjttt	j
t	jjjttjjjfsbtdt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])
isinstancelisttupletorchZ	_inductorZir
ExpandViewDynamicScalar	TensorBoxsympySymbollogicboolalgBooleanr   AssertionErrortype)nodesnode_check_tensorbox Q/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torch/_inductor/ir.pyrE   i   s     z%validate_ir.<locals>._check_tensorboxrF   )Znode_or_nodesrF   rD   rG   validate_irh   s    rH   c                    s   t  tst fdd}|S )Nc                     s   t t | |S N)getattrr0   )argskwargsnamerF   rG   fn   s    zops_wrapper.<locals>.fn)r4   strr@   )rN   rO   rF   rM   rG   ops_wrapper   s    rQ   c                    s&   t t| tt|   fdd}|S )Nc                    s0   t  t kst fddtt  D S )Nc                    s   g | ]} |  qS rF   rF   .0i)index	inv_orderrF   rG   
<listcomp>   s     z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenr@   rangerU   rV   r[   rG   reindex   s    z inverse_reorder.<locals>.reindex)dictziprZ   rY   orderr]   rF   r\   rG   inverse_reorder   s    rb   c                    s    fdd}|S )Nc                    s0   t  t kst fddtt  D S )Nc                    s   g | ]} |  qS rF   rF   rR   )rU   ra   rF   rG   rW      s     z1same_reorder.<locals>.reindex.<locals>.<listcomp>rX   r[   ra   r[   rG   r]      s    zsame_reorder.<locals>.reindexrF   r`   rF   rc   rG   same_reorder   s    rd   c                    s    fdd}|S )Nc                    s    | S rI   rF   r[   reindex1reindex2rF   rG   r]      s    z fuse_reindexing.<locals>.reindexrF   )rf   rg   r]   rF   re   rG   fuse_reindexing   s    rh         c                    s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,
    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S   s   i | ]\}}||qS rF   rF   rS   idxposrF   rF   rG   
<dictcomp>   s      z+stride_order2fill_order.<locals>.<dictcomp>c                    s   g | ]} | qS rF   rF   rR   lookuprF   rG   rW      s     z+stride_order2fill_order.<locals>.<listcomp>)	enumeraterZ   rY   )ra   
fill_orderrF   ro   rG   stride_order2fill_order   s    rs   )seqc                 C   s<   t | }dd tt| D }t|D ]\}}|||< q&|S )z)
    Convert strides to stride order
    c                 S   s   g | ]}d qS rI   rF   rS   _rF   rF   rG   rW      s     z$get_stride_order.<locals>.<listcomp>)r$   rZ   rY   rq   )rt   Z
sorted_idxoutrT   elemrF   rF   rG   get_stride_order   s
    
ry   Tc                    s   | d krd S |st jjj nt  fdd|  D }t| rX fdd|  jD }nt	|}| 
 }|  }t|}t|}tj||||d }|S )Nc                    s   g | ]} |qS rF   rF   rS   sZshape_fnrF   rG   rW      s     z%ir_node_to_tensor.<locals>.<listcomp>c                    s   g | ]} |qS rF   rF   rz   r|   rF   rG   rW      s     )sizestridedtypedevice)r1   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr~   r   	get_dtype
get_devicer'   r7   Zempty_stridedZzero_)xguard_shaper}   r~   r   r   trF   r|   rG   ir_node_to_tensor   s(       
r   c                   @   s   e Zd Zdd Zdd ZdS )OptionalAttrc                 C   s
   d| _ d S )NZoptional_attrrM   selfrF   rF   rG   __init__   s    zOptionalAttr.__init__c                 C   s   | j S rI   rM   r   rF   rF   rG   __repr__   s    zOptionalAttr.__repr__N)__name__
__module____qualname__r   r   rF   rF   rF   rG   r      s   r   c                   @   s   e Zd Zdd ZdS )OptionalStringc                 C   s
   d| _ d S )Noptional_stringrM   r   rF   rF   rG   r      s    zOptionalString.__init__Nr   r   r   r   rF   rF   rF   rG   r      s   r   c                   @   s   e Zd Zdd ZdS )OptionalListc                 C   s
   d| _ d S )Noptional_listrM   r   rF   rF   rG   r      s    zOptionalList.__init__Nr   rF   rF   rF   rG   r      s   r   c                   @   s   e Zd Zdd ZdS )OptionalScalarc                 C   s
   d| _ d S )Noptional_scalarrM   r   rF   rF   rG   r      s    zOptionalScalar.__init__Nr   rF   rF   rF   rG   r      s   r   c                   @   s   e Zd Zdd ZdS )OptionalLayoutc                 C   s
   d| _ d S )NZoptional_layoutrM   r   rF   rF   rG   r      s    zOptionalLayout.__init__Nr   rF   rF   rF   rG   r      s   r   c                   @   s   e Zd Zdd ZdS )OptionalTensorc                 C   s
   d| _ d S )NZoptional_tensorrM   r   rF   rF   rG   r      s    zOptionalTensor.__init__Nr   rF   rF   rF   rG   r      s   r   zOptional[Layout]c                 C   s   |st jjr| S |S rI   )r1   r   cpp_wrapper)Zoptional_valuevaluerF   rF   rG   may_convert_to_optional   s    r   c                 C   s.   t | dd rt|  S t| tjr*| jS d S )Nr   )rJ   get_device_typer   r4   r7   r   rA   r   rF   rF   rG   r      s
    r   c                 C   s   t | dkS )Ncudar   r   rF   rF   rG   	is_triton  s    r   c                 C   s   t | dkS )Ncpur   r   rF   rF   rG   is_cpu  s    r   c                   @   s   e Zd ZU e Zeee  ed< e	e
jeejj dddZdd Zdd Zd	d
 Zdd Zdd Zedd Zdd Zdd Zdd ZdS )IRNode_current_origins)originsc                 c   s(   t j}|| B t _z
d V  W 5 |t _X d S rI   )r   r   )r   oldrF   rF   rG   current_origins  s
    

zIRNode.current_originsc                 C   s$   t | j| _tjrt nd | _d S rI   )setr   r   r   debug_ir_traceback	tracebackformat_stackr   rF   rF   rG   __post_init__  s    zIRNode.__post_init__c                 C   s   | j S rI   )r   r   rF   rF   rG   get_traceback  s    zIRNode.get_tracebackc                 C   s6   dt | dd }t|dkr0|d d  d}|gS )Nzorigins=r    @   =   z...)rJ   rY   )r   r   rF   rF   rG   common_repr   s    zIRNode.common_reprc                 C   s6   ||    }tdtt|}t| j d| dS )Nz,
z(
z
))r   indentjoinmaprP   rA   r   r   linesrF   rF   rG   
str_helper'  s    zIRNode.str_helperc                 C   s   ||   kS rI   )get_read_namesr   rN   rF   rF   rG   
is_user_of,  s    zIRNode.is_user_ofc                 C   s   dd |   D S )Nc                 S   s   h | ]
}|j qS rF   rM   )rS   deprF   rF   rG   	<setcomp>1  s     z(IRNode.get_read_names.<locals>.<setcomp>)	get_readsr   rF   rF   rG   r   /  s    zIRNode.get_read_namesc                 C   s   t |  S rI   )r,   r   r   rF   rF   rG   	get_numel3  s    zIRNode.get_numelc                 C   s   t jjt|  dS Nr   r1   r   r   is_expr_static_and_truer;   Eqr   r   rF   rF   rG   is_zero_elements6  s    zIRNode.is_zero_elementsc                 C   s   t dt|  dS )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on N)NotImplementedErrorrA   r   rF   rF   rG   realize9  s    zIRNode.realizeN)r   r   r   r   r   r   r   r   __annotations__staticmethod
contextlibcontextmanagerr7   fxNoder   r   r   r   r   r   r%   r   r   r   r   rF   rF   rF   rG   r     s   

r   c                       s   e Zd ZU ejed< ejed< edef ed< e	e
 ed< d ddZ fd	d
ZeZdd Zdd Zdd Zdd Zdd Zedd Zed!ddZedd Zdd Zdd Z  ZS )"Loopsr   r   .inner_fnrangesr   c                    sF     d jj dt j  g fdd|D  d jg S )N'c                    s    g | ]}| d t  | qS =)rJ   )rS   rN   r   rF   rG   rW   Z  s     z!Loops.__str__.<locals>.<listcomp>origin_node=)r   r   rA   rP   r   inner_fn_strorigin_node)r   namesrF   r   rG   __str__S  s    zLoops.__str__c                    s   t    d | _d S rI   superr   r   r   	__class__rF   rG   r   ^  s    
zLoops.__post_init__c                 C   s   | j S rI   r   r   rF   rF   rG   r   d  s    zLoops.get_dtypec                 C   s   | j S rI   r   r   rF   rF   rG   r   g  s    zLoops.get_devicec                 C   s   | j S rI   r   r   rF   rF   rG   get_origin_nodej  s    zLoops.get_origin_nodec                 C   s   | j S rI   r   r   rF   rF   rG   r   m  s    zLoops.get_sizec                 C   s   dS NFrF   r   rF   rF   rG   	is_externp  s    zLoops.is_externc                 O   sJ   | dd }| dd }| ||}||_tjr:|p<t nd |_t|S )Nr   r   )popr   r   r   r   r   r:   create)clsrK   rL   r   tbrrF   rF   rG   r   s  s    
zLoops.createrT   c                    s    fddt | D S )Nc                    s2   g | ]*\}}|d krt dnt  | qS )r   r   )r;   r   r.   )rS   nr{   r2   rF   rG   rW     s   z Loops._index.<locals>.<listcomp>rq   )r   r3   rF   r2   rG   _index~  s    
zLoops._indexc                 C   s   t |  S rI   )rY   r   r   rF   rF   rG   inner_fn_str_len  s    zLoops.inner_fn_str_lenc                 C   s   |  | j}tj| j|S rI   )r   r   r1   KernelFormatterHandlerir_to_stringr   )r   rU   rF   rF   rG   r     s    zLoops.inner_fn_strc              
   C   sl   t tddT |  r>t|  |  |  jW  5 Q R  S t|  |  jW  5 Q R  S W 5 Q R X d S Nallow_indexingT)	r   objectFlexibleLayoutget_reduction_typer"   make_loaderr   get_reduction_sizereadsr   rF   rF   rG   r     s    zLoops.get_reads)r   )rT   )r   r   r   r7   r   r   r   r   r   r
   r   r   r   r   r   r   r   r   r   classmethodr   r   r   r%   r   r   r   __classcell__rF   rF   r   rG   r   L  s(   






r   c                C   s&   |j rttd|S td|S d S )Nnanr   )is_floating_pointr0   constantfloat)rl   r   rF   rF   rG   nop_loader_fn  s    r   c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )	Pointwisec                 C   s   |   rtt| jdS | jS )Nr   )r   r   r   r   r   r   rF   rF   rG   r     s    zPointwise.make_loaderc                 C   s   g S rI   rF   r   rF   rF   rG   r     s    zPointwise.get_reduction_sizec                 C   s   d S rI   rF   r   rF   rF   rG   r     s    zPointwise.get_reduction_typec                 C   s   |   }t|||||S rI   )r   r0   storer   output_nameindexervarsloaderrF   rF   rG   store_output  s    zPointwise.store_outputc                 C   s,   |   }ttd||}t|| j|| jS FMove this to a given device. Requires that all reads are to constants.override_device)r   r   r   ConstantBufferr  r   r   r   r   r  rF   rF   rG   constant_to_device  s    zPointwise.constant_to_deviceN)r   r   r   r   r   r   r  r  rF   rF   rF   rG   r    s
   r  c                   @   sD   e Zd ZU eee gef ed< dZee	 ed< dd Z
dd ZdS )Scatteroutput_indexerNscatter_modec                 C   s4   |   }ttd||}t|| j|| j| j| jS r	  )	r   r   r   r  r  r   r   r  r  r  rF   rF   rG   r    s    zScatter.constant_to_devicec                 C   s*   |   }tj||| |||| jdS )N)mode)r   r0   r  r  r  r  rF   rF   rG   r    s    zScatter.store_output)r   r   r   r   r
   r   r   r  r   rP   r  r  rF   rF   rF   rG   r    s   
r  c                   @   s   e Zd ZdZdZdZdZdS )ReductionHintr   r   rj   ri   N)r   r   r   INNEROUTER
OUTER_TINYDEFAULTrF   rF   rF   rG   r    s   r  c                   @   s   e Zd ZdZdZdS )TileHintr   r   N)r   r   r   SQUAREr  rF   rF   rF   rG   r    s   r  
logical_ormaximumminimummuladdZbitwise_xor)anymaxminprodsumxor_sumc                    sN   t krt  }n8dkr* fdd}n dkr<dd }ntd |S )N   argminargmaxc           
   
      s   | \}}|\}}dkr&t ||}nt ||}t ||}t rt ||}t ||}	t |t ||	}t |t ||	}t |t |t ||}t |||t |||fS )Nr&  )	r0   ltgteqr   ner  logical_andwhere)
abZa_valueZa_indexZb_valueZb_indexmaskequalZa_isnanZb_isnanr   reduction_typerF   rG   
combine_fn  s$     z,get_reduction_combine_fn.<locals>.combine_fnwelford_combinec                 S   sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS rI   rF   )r.  r/  Za_meanZa_m2Za_weightZb_meanZb_m2Zb_weightdeltaZ
new_weightZ	w2_over_wrF   rF   rG   r4    s    


zunknown reduction_type=)REDUCTION_COMBINE_FNr   )r3  r   r4  rF   r2  rG   get_reduction_combine_fn  s    

r8  c                   @   s<  e Zd ZU ee ed< eed< ejed< e	ed< dd Z
dd Zd	d
 Zdd Zdd Zdd Zdd Zdd Zedd Zedd Zee	jfejejejedef ee ee ee	dddZedd Zedd  Zeeee	e	d!d"d#Zed$d% Zeejejejedef ee ee eee	d&	d'd(Z d)S )*	Reductionreduction_rangesr3  	src_dtypereduction_hintc                 C   s   t j| ddS )N)r   r:  r3  )r   )r   r   r   rF   rF   rG   r   (  s     zReduction.__str__c                 C   s   |   S rI   )r   r   rF   rF   rG   r   -  s    zReduction.__repr__c                 C   s   | j S rI   )r:  r   rF   rF   rG   r   0  s    zReduction.get_reduction_sizec                 C   s   | j S rI   r3  r   rF   rF   rG   r   3  s    zReduction.get_reduction_typec              	   C   s0   t | j| j| j| ||}t ||||S rI   )r0   	reductionr   r;  r3  r   store_reduction)r   r  r  r  reduction_varsr   rF   rF   rG   r?  6  s    
zReduction.store_reductionc                 C   s   t | jt | j S rI   )rY   r   r:  r   rF   rF   rG   index_length?  s    zReduction.index_lengthc                 C   s,   |  | j}|  | jd}tj| j||S )Nr   )r   r   r:  r1   r   r   r   )r   rU   rindexrF   rF   rG   r   B  s    zReduction.inner_fn_strc              	   C   s<   |   }ttd||}t|| j|| j| j| j| j	t
jS r	  )r   r   r   r  r9  r   r   r:  r3  r;  r  r  r  rF   rF   rG   r  K  s    zReduction.constant_to_devicec              	      s  dd }t jj|}	t jjt|}
t| oL|dkoLtjoL||	oL||
}|s\tj	dfS t
| jddd      fdd	} fd
d}|
dkrtj||	|
fS |	ks|
d d krtj	dfS t| ||||||tj	}dd }||\}}|r.||\}}t|dkrFtj	dfS t| | \\}}}d}d}|D ]V}t jj||}t jj||| }tdd |D }|r|d7 }n|d7 }qn||krtj||	|
fS tj||	|
fS d S )Nc                 S   s   t | ttjfS rI   )r4   intr;   r   r   rF   rF   rG   
_is_statice  s    z(Reduction.num_splits.<locals>._is_staticr%  r          i   c           	         s  d}d| }|d krdS | dkr(dS | | kr:}n| | k r d|  }|| d | }| ||  d ||   t | }t| fddd}t|  d	k rt|}q }n8t | }t|fd
dd}t| dk r|}n}| ||  d ||  S )N   rE  rj   r   i    c                    s   t |   S rI   absr   Ztmp_split_sizerF   rG   <lambda>      zFReduction.num_splits.<locals>.inner_reduction_splits.<locals>.<lambda>key   c                    s   t |   S rI   rH  r   max_elements_per_threadrF   rG   rK    rL  2   r;   divisorsr!  rI  r   )	reduction_numel_hint
numel_hint	num_warpsnum_threads
split_sizetarget_blocksZblocks_per_outputrT  closestZmax_elements_per_devicerQ  Zmin_elements_per_deviceZmin_elements_per_threadZnum_smZthreads_per_smrJ  rG   inner_reduction_splits  s6    

z4Reduction.num_splits.<locals>.inner_reduction_splitsc                    s  d}|d }d}d}|| d | }| | k r6}n| | k r | }|| d | }| ||  d ||   t | }	t|	 fddd}
t |
 d	k rt|
}q }n8t | }	t|	fd
dd}
t|
 dk r|
}n}| ||  d ||  S )NrG  rE        r   c                    s   t |   S rI   rH  r   rJ  rF   rG   rK    rL  zFReduction.num_splits.<locals>.outer_reduction_splits.<locals>.<lambda>rM     c                    s   t |   S rI   rH  r   rP  rF   rG   rK    rL  rR  rS  )rU  rV  rW  rX  Zrvals_per_threadZxvals_per_blockZxblocksrY  rZ  rT  r[  r\  rJ  rG   outer_reduction_splits  s4    

z4Reduction.num_splits.<locals>.outer_reduction_splitsrj   c                    s   t d t|  |  |  d| d}| }dd |jD }g }d}t|jdd dD ]b t	 fd	d
|D rV|
 j  jtjjkrVtjj j }|jj}|  |jj|krVd}qV||fS )Nr   r   r}   rN   layoutdatac                 S   s(   g | ] }t |tjrt |tjs|qS rF   )r4   r;   r   NumberrS   r   rF   rF   rG   rW     s    zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S   s   | j S rI   rM   r   rF   rF   rG   rK    rL  z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>rM  c                 3   s   | ]}| j jkV  qd S rI   )rU   Zfree_symbolsrg  mdrF   rG   	<genexpr>  s     zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>T)ComputedBufferr   r   r   r   get_read_writes
range_varssortedr   allappendrU   rN   r1   r   name_to_bufferrd  r~   decide_layout)r   cbread_writesrm  indiceschangedbufZoriginal_striderF   rh  rG   get_read_indices  s2    	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s   s   | ]}|d kV  qdS )r   NrF   rz   rF   rF   rG   rj    s     z'Reduction.num_splits.<locals>.<genexpr>)r1   r   r   Zsymbolic_hintr,   r   r   Zsplit_reductionsr  r  r!   Zmulti_processor_countr  r9  rY   r   index_vars_squeezer   r   simplify_with_rangesstride_hintskeysro  r  )r   	dst_dtyper;  r   r   r:  r3  reduction_numelrD  rU  rV  Zshould_splitr]  ra  r   rx  ru  rv  rv   r@  Z	num_outerZ	num_innerrT   stridesouterrF   r\  rG   
num_splitsZ  s    	

$$ 
 
 

  zReduction.num_splitsc                    sn   dd D t ||  fdd|dkrbtddt fddfd	d
S S dS )z1Convert inner_fn from a reduction to an pointwisec                 S   s   g | ]}t jj|qS rF   )r1   r   r   Zevaluate_static_shaperS   r   rF   rF   rG   rW     s    z2Reduction._unroll_reduction_fn.<locals>.<listcomp>c                    s,   t  fddtjdd D  D S )Nc                 3   s   | ]} |V  qd S rI   rF   )rS   rB  )rU   value_fnrF   rG   rj  (  s   z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S   s   g | ]}t |qS rF   )rZ   r  rF   rF   rG   rW   +  s     z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)	functoolsreduce	itertoolsproductr[   )r4  r:  r  r[   rG   rO   %  s    z*Reduction._unroll_reduction_fn.<locals>.fnr&  r'  Nc                    s*   dd |D }| |t  |tjfS )Nc                 S   s   g | ]}t |qS rF   )r;   expandrR   rF   rF   rG   rW   9  s     zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)r0   
index_exprr7   int64rU   rB  )flatten_indexr   rF   rG   r  8  s    z0Reduction._unroll_reduction_fn.<locals>.value_fnc                    s    | d S Nr   rF   r[   )rO   rF   rG   rK  ?  rL  z0Reduction._unroll_reduction_fn.<locals>.<lambda>)r8  FixedLayoutr   contiguous_stridesmake_indexer)r   r:  r3  r;  rF   )r4  r  rO   r   r:  r  rG   _unroll_reduction_fn  s     
zReduction._unroll_reduction_fn.)r   r}  r;  r   r   r:  r3  r<  c	                    sz  t jjt}	|	dkr fdd}
|
d|
d|
d|
dd ks^t d fdd}tj|||t	|d	S |	dkrd
kr fdd}nfdd}t| ||S t
|	tjrt jj|	tjk rt|dkrt| | ||S | | |||	\}}|tjkr:|}|dkr^| | ||||	S tt| |||S )Nr   c                    s(    t jkrt| S  jr t| S t| S rI   )r7   boolr   r   rC  valr}  rF   rG   py_cnstV  s    

z!Reduction.create.<locals>.py_cnstr   )r#  r$  r"  r  z* not supported for zero-dimension tensors!c                    s   t   S rI   r0   r   r[   )r}  r3  rtypes_to_initsrF   rG   const_fnk  s    z"Reduction.create.<locals>.const_fnr   r   r   r   r  c                    s   t d S r   r  r[   r  rF   rG   rO   y  s    zReduction.create.<locals>.fnc                    s   dd D } | |S )Nc                 S   s   g | ]}t d qS r   r;   r   ru   rF   rF   rG   rW     s     z0Reduction.create.<locals>.fn.<locals>.<listcomp>rF   rU   reduction_index)r   r:  rF   rG   rO   ~  s    )r1   r   r   simplifyr,   r|  r@   r  r   r5   r4   r;   r   r   r   Zunroll_reductions_thresholdr  r  r  r  create_multilayerr:   r9  )r   r   r}  r;  r   r   r:  r3  r<  r~  r  r  rO   hintsplitrF   )r}  r   r:  r3  r  rG   r   D  s    
	


   

zReduction.createc                 C   sv   | dkr0t |rtdS t|r$dS t|jS | dkr`t |rHtdS t|rTdS t|jS ddddddd|  S )	N>   r'  r   z-infr   >   r&  r!  infr   r   r   r   )r#  r"  r$  r  welford_reducer5  )r   r   r   r7   iinfor!  r   r3  r   rF   rF   rG   default_accumulator  s*    zReduction.default_accumulatorc                 C   s   | dkrdS t | |S )Nr  r   )r9  r  r  rF   rF   rG   default_value  s    zReduction.default_value)r  rV  r<  returnc                 C   sD   | dkr |dkr |t jkr t jS | dkr@|dkr@|t jkr@t jS |S )NrF  i      )r  r  r  )r  rV  r<  rF   rF   rG   _multilayer_second_step_hint  s    z&Reduction._multilayer_second_step_hintc                    sD   t |gtjjt| d  fdd}|S )Nr   c                    sj   |\}| ^ }| |   fdd}r`t t  tjt tj}t ||S | S d S )Nc                      s    gS rI   rF   rF   )ru  r  	new_indexr]   rF   rG   body  s    zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)r0   r(  r  r7   Zint32masked)rU   r  Zreduction_blockr  r0  
block_sizedefaultr  	need_maskr~  r]   )ru  r  rG   
wrapper_fn  s    
z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)Viewdynamic_reshape_indexerr1   r   r   r   r;   r   )r   r  r:  r~  r  r  r  r  rF   r  rG   _multilayer_wrap_loader  s    
z!Reduction._multilayer_wrap_loader)	r   r}  r;  r   r   r:  r3  r  r<  c
                    s   t |}
t|
|d  |}| ||}| |||
|||}|tjtjfkrN|ntj}t	||||||f|g||	}|
  |   fdd}tjjt |}| |||	}	t	t|||||g|||	S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        r   c                    s    | |S rI   rF   r  Zintermediate_loaderrF   rG   intermediate_fn=  s    z4Reduction.create_multilayer.<locals>.intermediate_fn)r,   r   r  r  r7   Zfloat16Zbfloat16r   r9  r   r   r   r1   r   r   r   r  r:   )r   r   r}  r;  r   r   r:  r3  r  r<  r~  r  r  r  Zintermediate_dtypeZintermediater  rV  rF   r  rG   r    s\         	
  zReduction.create_multilayerN)!r   r   r   r
   r   r   rP   r7   r   r  r   r   r   r   r?  rA  r   r  r   r  r  r   r  r   r   r   r   r  r  rC  r  r  r  rF   rF   rF   rG   r9     sh   

		
 B
'

y

  
!
r9  c                 C   s   d| krdS dS )Nwelfordri   r   rF   r=  rF   rF   rG   num_reduction_outputsR  s    r  c                
       s   e Zd ZU eed<  fddZdd Zeej	fe
je
jeedef  ee ee eeddd	Zed
d Zee
je
jeedef  ee ee eeedddZ  ZS )WelfordReductionoutput_indexc	           
   
      sF   t  dkr d }	n fdd}	t |||	||||| || _d S )Nr   r   c                    s   t  fddD S )Nc                 3   s   | ]}| V  qd S rI   rF   rS   rO   rl   reduction_idxrF   rG   rj  i  s     z<WelfordReduction.__init__.<locals>.loader.<locals>.<genexpr>)r6   r  	inner_fnsr  rG   r  h  s    z)WelfordReduction.__init__.<locals>.loader)rY   r   r   r  )
r   r   r   r  r   r:  r3  r<  r  r  r   r  rG   r   Y  s    

zWelfordReduction.__init__c              	   C   s:   t | j| j| j| ||}|| j }t ||||S rI   )r0   r>  r   r;  r3  r   r  r?  )r   r  r  r  r@  valuesr   rF   rF   rG   r?  w  s    

z WelfordReduction.store_reduction.)r   r   r  r   r:  r3  r<  c              
      s2  dkst tjjt}fdd}	|dkrX|	d}
|	d}|	d}|
||fS |dkrfdd dkr d |	d|	dfS t fd	d
D S tjd |d\}}t	j
kr||dkr| |S fddtdD }|D ]}|  q|S )N>   r  r5  c                    s"    fdd}t j|tdS )Nc                    s   t  tS rI   )r0   r   r}  rl   r  rF   rG   r     s    z8WelfordReduction.create.<locals>.const.<locals>.inner_fnr  )r  r   r5   )rl   r  r   )r   r   r   r  rG   const  s    z&WelfordReduction.create.<locals>.constr   r   c                    s$    fdd}t jt|tdS )Nc                    s   dd D } | |S )Nc                 S   s   g | ]}t d qS r  r  ru   rF   rF   rG   rW     s     zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>rF   )rl   r  )r  r:  rF   rG   r     s    z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnr  )r  r   r}  r5   )r  r   )r   r   r:  r  rG   copy  s    z%WelfordReduction.create.<locals>.copyr  c                 3   s   | ]} |V  qd S rI   rF   r  )r  rF   rG   rj    s     z*WelfordReduction.create.<locals>.<genexpr>)r3  r~  c                    s(   g | ] }t t |qS rF   )r:   r   r  )rS   Z
output_idx)r   r   r  r   r<  r:  r3  rF   rG   rW     s   z+WelfordReduction.create.<locals>.<listcomp>ri   )r@   r1   r   r   r  r,   r6   r9  r  r  r  r  rZ   r   )r   r   r   r  r   r:  r3  r<  r~  r  Zmeanm2weightr  r  resultsr   rF   )r  r   r   r  r   r<  r:  r3  rG   r     sT    


zWelfordReduction.createc                 C   s   dS )Nr  rF   r  rF   rF   rG   r    s    zWelfordReduction.default_valuer   r   r  r   r:  r3  r  r<  c	              
      s,  t tjjt d }	|	rp|dkrpfdd}
j||d t|
ddt|
ddf|d|dS t	d   t
|t fdd	|D |f g||}|D ]}|  qd
d |D }dd tjjt |}||}t
|tfdd	|D |gd|S )r  r   r5  c                    s   t | S rI   r  )rl   r  r   r   rF   rG   r     s    z4WelfordReduction.create_multilayer.<locals>.constant)r   r   r  c              	   3   s$   | ]}j | d dV  qdS )r   )r  N)r  )rS   r  )r  r   r~  r:  r  rF   rG   rj  '  s   	z5WelfordReduction.create_multilayer.<locals>.<genexpr>c                 S   s   g | ]}|  qS rF   )r   rR   rF   rF   rG   rW   :  s     z6WelfordReduction.create_multilayer.<locals>.<listcomp>c                 S   s   || |S rI   rF   )rU   r  r  rF   rF   rG   intermediate_loader_fn<  s    zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3   s   | ]}t  | d V  qdS )r  N)r   r   rR   )r  rF   rG   rj  F  s   )r,   r1   r   r   r   r;   r   r  r   r   r  r   r6   r   r   r  )r   r   r   r  r   r:  r3  r  r<  r  r   ZintermediatesrT   Z	i_loadersrV  rF   )r  r   r   r  r~  r:  r  rG   r    sh    

	
  z"WelfordReduction.create_multilayer)r   r   r   rC  r   r   r?  r   r  r  r7   r   r   r   r   r   r
   r   rP   r   r   r  r  r   rF   rF   r   rG   r  V  s4   

	r
r  c                 C   s.   zt | dd W dS  tk
r(   Y dS X d S )NFfreezeT)as_storage_and_layoutr   r   rF   rF   rG   r   R  s
    r   c                 C   s6   zt | dd\}}| W S  tk
r0   Y dS X d S NFr  )r  is_contiguousr   )r   bufferrd  rF   rF   rG    is_contiguous_storage_and_layoutZ  s
    
r  Fc                 C   s   t | trt| j|||dS t | trt | jtr|rv|rV| j  | jj svt	n |dk	rl| j
| n
| j  | | jjfS t | trt| j|d\}}|| jfS tdS )z0Try to simplify x into a StorageBox and a Layoutr  want_contiguousstride_orderNr  )r4   r:   r  re  
StorageBoxBufferfreeze_layoutrd  r  r@   freeze_layout_with_stride_orderrr  ReinterpretViewr   )r   r  r  r  r  rv   rF   rF   rG   r  b  s.    





r  )r  c                 C   s8   zt | dd\}}||W S  tk
r2   Y dS X d S r  )r  is_stride_orderedr   )r   r  r  rd  rF   rF   rG   "is_stride_order_storage_and_layout  s
    r  c                   @   s   e Zd ZU eed< dd Zdd Zdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"S )#BaseViewre  c                 C   s   t d|  d S )Nzmake_reindexer NYI on r   r   rF   rF   rG   make_reindexer  s    zBaseView.make_reindexerc                    s$   | j   |   fdd}|S )Nc                    s    | S rI   rF   r  innerr]   rF   rG   r    s    z&BaseView.make_indexer.<locals>.indexer)re  r  r  r   r  rF   r  rG   r    s    
zBaseView.make_indexerc                    s$   | j   |   fdd}|S )Nc                    s    | S rI   rF   r  r  rF   rG   r    s    z$BaseView.make_loader.<locals>.loader)re  r   r  r   r  rF   r  rG   r     s    
zBaseView.make_loaderc                 C   s
   | j  S rI   )re  r   r   rF   rF   rG   r     s    zBaseView.get_dtypec                 C   s
   | j  S rI   )re  r   r   rF   rF   rG   r     s    zBaseView.get_devicec                 C   s   d S rI   rF   r   rF   rF   rG   r     s    zBaseView.get_origin_nodec                 C   s
   | j  S rI   re  get_namer   rF   rF   rG   r    s    zBaseView.get_namec                 C   s   | j |S rI   )re  
mark_reuser   usersrF   rF   rG   r    s    zBaseView.mark_reusec                 C   s
   | j  S rI   )re  has_exceeded_max_readsr   rF   rF   rG   r    s    zBaseView.has_exceeded_max_readsc                 C   s
   | j  S rI   re  r   r   rF   rF   rG   r     s    zBaseView.realizec                 C   s
   | j  S rI   )re  realize_hintr   rF   rF   rG   r    s    zBaseView.realize_hintc                 C   s
   | j  S rI   )re  get_storage_numelr   rF   rF   rG   r    s    zBaseView.get_storage_numelc                 C   s
   | j  S rI   )re  r   r   rF   rF   rG   r     s    zBaseView.is_externc              
   C   s:   t tdd" t|  |  jW  5 Q R  S Q R X d S r   )r   r   r   r"   r   r   r   r   rF   rF   rG   r     s
    zBaseView.get_readsc                 C   s   | }t |tr|j}q|S rI   )r4   r  re  r   r   rF   rF   rG   unwrap_view  s    
zBaseView.unwrap_viewc                 C   s0   |   }ttd||}t||  ||  S r	  )r   r   r   r  r  r   r   r  rF   rF   rG   r    s    zBaseView.constant_to_deviceN)r   r   r   r   r   r  r  r   r   r   r   r  r  r  r   r  r  r   r   r  r  rF   rF   rF   rG   r    s"   
		r  c                   @   sB   e Zd ZU ee ed< edd Zedd Z	dd Z
dd	 Zd
S )r8   r}   c                 C   s   t ttj|}|  }dgt|t|  t | }t|t|ksJttt|D ],}|| dkrV|| dk	svt|| ||< qV|S )zReplace `-1` with correct sizesN)r5   r   r;   r  r   rY   r@   rZ   )r   new_sizeold_sizerT   rF   rF   rG   _normalize_size  s    zExpandView._normalize_sizec           
      C   s   |  ||}t|rt|\}}t|t|j }|dks>ttdg| }t|j	|jD ]$\}}|
|dkrt|ntd q\t|j|jt|||j}	t||	S t||S Nr   r   )r  r   r  rY   r}   r@   r;   r   r_   r~   rp  r  r   r   r5   offsetr  r8   )
r   r   r  storage
old_layoutskip
new_strider~   r}   
new_layoutrF   rF   rG   r     s"    
zExpandView.createc                 C   s   | j S rI   r}   r   rF   rF   rG   r     s    zExpandView.get_sizec                    s4   |   }| j   t|t   fdd}|S )Nc                    sT   t | d  } t| t ks$ttt D ]} | dkr0td| |< q0| S )Nr   r   )r5   rY   r@   rZ   r;   r   )rU   rT   actualr  rF   rG   r]     s    z*ExpandView.make_reindexer.<locals>.reindex)r   re  rY   )r   targetr]   rF   r  rG   r    s
    
	zExpandView.make_reindexerN)r   r   r   r
   r   r   r   r  r   r   r   r  rF   rF   rF   rG   r8     s   


r8   c                   @   sB   e Zd ZU ee ed< edd Zedd Zdd Z	dd	 Z
d
S )PermuteViewdimsc                    s   |  |}t|ttt|ks&tt|rvt|\} t j j	 fdd|D  fdd|D  j
}t||S t||S )Nc                    s   g | ]} j | qS rF   r  rR   r  rF   rG   rW   !  s     z&PermuteView.create.<locals>.<listcomp>c                    s   g | ]} j | qS rF   )r~   rR   r  rF   rG   rW   "  s     )_map_neg_dimsr   rZ   rY   r@   r   r  r  r   r   r  r  r  )r   r   r  r  r  rF   r  rG   r     s    

zPermuteView.createc                    s    fdd D S )Nc                    s$   g | ]}|d kr|n
t  | qS r  )rY   )rS   dimr  rF   rG   rW   +  s     z-PermuteView._map_neg_dims.<locals>.<listcomp>rF   )r   r  rF   r  rG   r  )  s    zPermuteView._map_neg_dimsc                    sD   t | | jt tt| jks&t| j   fdd| jD S )Nc                    s   g | ]} | qS rF   rF   rR   r  rF   rG   rW   0  s     z(PermuteView.get_size.<locals>.<listcomp>)r   r  r  rZ   rY   r@   re  r   r   rF   r  rG   r   -  s    &
zPermuteView.get_sizec                    s^   dd t | jD   fddtt| jD  t ttt| jksNt fdd}|S )Nc                 S   s   i | ]\}}||qS rF   rF   )rS   rT   jrF   rF   rG   rn   3  s      z.PermuteView.make_reindexer.<locals>.<dictcomp>c                    s   g | ]} | qS rF   rF   rR   invrF   rG   rW   4  s     z.PermuteView.make_reindexer.<locals>.<listcomp>c                    s    fddD S )Nc                    s   g | ]} | qS rF   rF   rR   r[   rF   rG   rW   8  s     z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>rF   r[   r  r[   rG   r]   7  s    z+PermuteView.make_reindexer.<locals>.reindex)rq   r  rZ   rY   r   r@   )r   r]   rF   r  rG   r  2  s
    zPermuteView.make_reindexerN)r   r   r   r
   r   r   r   r   r  r   r  rF   rF   rF   rG   r    s   


r  c                   @   sB   e Zd ZeddddZeeejdf dddZ	d	d
 Z
dS )SqueezeViewNr  c                   sD  t |rt|\}}g }g } d k	rPt ts6tdd krL t|jk sPttt|j|j	D ]`\}\}}	 d kr|dkr|
| |
|	 qb| kr|
| |
|	 qb|dksbtdqbt|j|j|||j}
t||
S  d krt|dd | D S |   dkstt| fddt| D S d S )Nzexpected integer dim argumentr   r   zexpected squeezed size to be 1c                 S   s   g | ]}|d kr|qS r   rF   rz   rF   rF   rG   rW   _  s      z&SqueezeView.create.<locals>.<listcomp>c                    s   g | ]\}}| kr|qS rF   rF   rS   rT   r{   r  rF   rG   rW   b  s      )r   r  r4   rC  r@   rY   r}   rq   r_   r~   rp  r  r   r   r  r  r  r   r   )r   r   r  r  r  r  r  rT   r}   r~   r  rF   r  rG   r   >  s8    



zSqueezeView.create.r  c                    sV   dd | D }dd t | D t|  ttj ttjdf d fdd}||fS )Nc                 S   s   g | ]}|d kr|qS r  rF   rz   rF   rF   rG   rW   f  s      z(SqueezeView.squeezer.<locals>.<listcomp>c                 S   s   g | ]\}}|d kr|qS r  rF   r  rF   rF   rG   rW   g  s      .)rU   r  c                    sV   t | t ks"t|  d tdg  }t| D ]\}}|||< q<t|S )N r   )rY   r@   r;   r   r_   r6   )rU   r  rl   r{   lengthZnot_onerF   rG   r]   j  s
    "
z%SqueezeView.squeezer.<locals>.reindex)rq   rY   r
   r;   r   r   )r}   r  r]   rF   r  rG   squeezerd  s
    &zSqueezeView.squeezerc                 C   s   t dd S )Nzuse SqueezeView.create())r@   )r   re  rF   rF   rG   r   s  s    zSqueezeView.__init__)r   r   r   r   r   r   r   r;   r   r  r   rF   rF   rF   rG   r
  =  s
   %r
  c                   @   sZ   e Zd ZU ee ed< edef ed< dd Zdd Z	dd	 Z
e
Zed
d Zdd ZdS )GenericViewr}   .r]   c                 C   s   | j S rI   )r]   r   rF   rF   rG   r  |  s    zGenericView.make_reindexerc                 C   sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S   s   g | ]}t d | qS )rT   r.   )rS   r   rF   rF   rG   rW     s     z+GenericView.reindex_str.<locals>.<listcomp>zlambda , z: )rZ   rY   r}   r5   r]   r   r   rP   )r   Z	index_oldZ	index_newrF   rF   rG   reindex_str  s    zGenericView.reindex_strc                 C   s$   |  | jd| j d|   gS )Nsize=zreindex=)r   re  r}   r  r   rF   rF   rG   r     s    zGenericView.__str__c                 C   s   | |t ||S rI   )r5   )r   r   r  r]   rF   rF   rG   r     s    zGenericView.createc                 C   s   | j S rI   r  r   rF   rF   rG   r     s    zGenericView.get_sizeN)r   r   r   r
   r   r   r   r   r  r  r   r   r   r   r   rF   rF   rF   rG   r  w  s   

r  c                   @   sH   e Zd Zedd Zedd Zedd Zedd Zed	d
 Z	dS )r  c                 C   s<   t | } t |}tjjjj}|t | dr8| | } | S r   )r;   r  r1   r   r   	shape_envevaluate_exprLt)rl   r}   r  rF   rF   rG   handle_negative_index  s    

zView.handle_negative_indexc                    s   t |ttfst| | |\ }tjj	 |r:|S d|kr^ fdd}| |t||S t
|rt |jtst|\}}t|j|j|t||j}t||S |  |}| |t||S )Nr   c                    s   t dgt  S r   )r6   rY   r[   r  rF   rG   fake_reindex  s    z!View.create.<locals>.fake_reindex)r4   r6   r5   r@   resolve_negative_sizer   r1   r   r   Zstatically_known_list_equalsr  re  ExternKernelAlloc as_contiguous_storage_and_layoutr  r   r   r   r  r  r  r  )r   r   r  r  r  r  r  r]   rF   r  rG   r     s,    
 
zView.createc                 C   s   dd |D }dd | D } t |}tt|D ]8}|| dkr0td||< tt| t|||<  qjq0tjj	
t| t| | |fS )Nc                 S   s   g | ]}t jj|qS rF   r1   r   r   r  r  rF   rF   rG   rW     s     z.View.resolve_negative_size.<locals>.<listcomp>c                 S   s   g | ]}t jj|qS rF   r   r  rF   rF   rG   rW     s     r  r   )r5   rZ   rY   r;   r   r   r,   r1   r   r   guard_equals)r  r  rT   rF   rF   rG   r    s    zView.resolve_negative_sizec              	   C   sZ   z|  ||}W nD ttfk
rT   t|g}|  ||}|  ||}t||}Y nX |S rI   )_dynamic_reshape_indexerr@   
IndexErrorr,   rh   )r   r  r  r]   Zflatrf   rg   rF   rF   rG   r    s    
zView.dynamic_reshape_indexerc                    sD  t jjj}dd tt|D  tt |}t| }g |r|r| }| \}}|dkr	t
d |	||f q:|dkr|	| q:||||kr	| t jj|| q:||||k r$||||k r| \}}	|| | }||	 }qԈ	| t jj|| q:||||krt
d}
|}	t||
| |
| }
||||kr| }	t||
| |
| }
|| }q^t jj|| q:t q:|r| }t jj|d 	t
d q|r| \}}t jj|d qtttt| ks2t fdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        c                 S   s   g | ]}t d | qS viewr  rR   rF   rF   rG   rW     s     z1View._dynamic_reshape_indexer.<locals>.<listcomp>r   r   c                    sH   t | t ks$tt | t ftt|  t fddD S )Nc                 3   s   | ]}t | V  qd S rI   r-   r  replacementsrF   rG   rj    s     zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)rY   r@   r^   r_   r6   r[   r  Z	view_exprr'  rG   r]     s    $z.View._dynamic_reshape_indexer.<locals>.reindex)r1   r   r   r   rZ   rY   r5   r_   r   rp  r;   r   r!  r   r@   reversed)r  r  r   Z	stack_newZ	stack_oldZsize_oldvarZsize_newZvar2Z	size_new2Zdivisormodulusr]   rF   r)  rG   r"    s\    




zView._dynamic_reshape_indexerN)
r   r   r   r   r  r   r   r  r  r"  rF   rF   rF   rG   r    s   



r  c                       s   e Zd ZU dZded<  fddZdd ZeZdd	 Zd
d Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z  ZS )r  z*Pretend our storage has a different layoutLayoutrd  c                    s&   t    t| jtr"| j | _d S rI   )r   r   r4   re  r  r  r   r   rF   rG   r     s    
zReinterpretView.__post_init__c                 C   s   |  | j| jgS rI   )r   re  rd  r   rF   rF   rG   r   $  s
    zReinterpretView.__str__c                 C   s
   | j  S rI   r  r   rF   rF   rG   r  .  s    zReinterpretView.get_namec                 C   s   | j jS rI   rd  r   r   rF   rF   rG   r   1  s    zReinterpretView.get_devicec                 C   s   d S rI   rF   r   rF   rF   rG   r   4  s    zReinterpretView.get_origin_nodec                 C   s   | j jS rI   )rd  r   r   rF   rF   rG   r   7  s    zReinterpretView.get_dtypec                 C   s   t | jjS rI   r5   rd  r}   r   rF   rF   rG   r   :  s    zReinterpretView.get_sizec                 C   s   t | jjS rI   r5   rd  r~   r   rF   rF   rG   
get_stride=  s    zReinterpretView.get_stridec                    s    fdd}|S )Nc                    s    j  }t  || S rI   )rd  r  r0   loadr  rU   r  r   rF   rG   r  A  s    
z+ReinterpretView.make_loader.<locals>.loaderrF   r  rF   r   rG   r   @  s    zReinterpretView.make_loaderc                 C   s
   | j  S rI   rd  r  r   rF   rF   rG   r  G  s    zReinterpretView.make_indexerc                 C   s   | j S rI   rd  r   rF   rF   rG   r   J  s    zReinterpretView.get_layoutc                 C   s   d S rI   rF   r   rF   rF   rG   r  M  s    zReinterpretView.freeze_layoutc              	   C   sX   t jj| jj}t jj| jj}t jj| jj}d| 	  d| d| d| d	S )Nzreinterpret_tensor(r  ))
r1   r   wrapper_codecodegen_shape_tuplerd  r}   r~   Zcodegen_sizevarr  r  )r   r}   r~   r  rF   rF   rG   codegen_referenceP  s    z!ReinterpretView.codegen_reference)r   r   r   __doc__r   r   r   r   r  r   r   r   r   r1  r   r  r   r  r9  r   rF   rF   r   rG   r    s    
r  c                   @   s   e Zd ZedddZdS )	SliceViewr   c                    s|  t dkstz"dkr6|dkr6dkr6|W S W n tk
rL   Y nX tjj}t| | 	  | 	|  }|
|  }|
|
  |dkr||   dkrdkr||   |S t| d   < t|r\t|\}}t|j}	|	   |	 < t|j|j|	|j|j    }
t||
S  fdd}t||dS )Nr   l    r   c                    sD   t | t ks$td|  d t| } |     |  < | S )Nzwrong ndim r  )rY   r@   r5   r[   r  r  startsteprF   rG   r]     s    $z!SliceView.create.<locals>.reindex)r}   r]   )r;   r  r@   	TypeErrorr1   r   r   r5   r   r  Zevaluate_minr   r!  r   r   r  r~   r  r   r   r  r  r;  )r   r   r  r=  endr>  r   r  r  r  r  r]   rF   r<  rG   r   [  s>    

&


zSliceView.createN)r   )r   r   r   r   r   rF   rF   rF   rG   r;  Z  s   r;  c                   @   sL   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )BaseConstantc                 C   s   dS NrF   rF   r   rF   rF   rG   r     s    zBaseConstant.get_sizec                 C   s   | j S rI   r   r   rF   rF   rG   r     s    zBaseConstant.get_dtypec                 C   s   | j S rI   r   r   rF   rF   rG   r     s    zBaseConstant.get_devicec                 C   s   d S rI   rF   r   rF   rF   rG   r     s    zBaseConstant.get_origin_nodec                 C   s   d S rI   rF   r  rF   rF   rG   r    s    zBaseConstant.mark_reusec                 C   s   dS r   rF   r   rF   rF   rG   r    s    z#BaseConstant.has_exceeded_max_readsc                 C   s   dS rB  rF   r   rF   rF   rG   r     s    zBaseConstant.get_readsc                 C   s   dS r   rF   r   rF   rF   rG   r     s    zBaseConstant.is_externN)r   r   r   r   r   r   r   r  r  r   r   rF   rF   rF   rG   rA    s   rA  c                   @   sB   e Zd ZU eed< ejed< ejed< dd Zdd Z	dd	 Z
d
S )Constantr   r   r   c                    s    fdd}|S )Nc                    s   t  j jS rI   )r0   r   r   r   r[   r   rF   rG   r    s    z$Constant.make_loader.<locals>.loaderrF   r  rF   r   rG   r     s    zConstant.make_loaderc                 C   s   d S rI   rF   r   rF   rF   rG   r     s    zConstant.realizec                 C   s   t | j| j|S rI   )rC  r   r   r   r   rF   rF   rG   r    s    zConstant.constant_to_deviceN)r   r   r   r   r   r7   r   r   r   r   r  rF   rF   rF   rG   rC    s   


rC  c                   @   s:   e Zd ZU eed< ejed< ejed< dd Zdd Z	dS )	IndexingConstantrU   r   r   c                    s    fdd}|S )Nc                    s   t  j jS rI   )r0   r  rU   r   r[   r   rF   rG   r    s    z,IndexingConstant.make_loader.<locals>.loaderrF   r  rF   r   rG   r     s    zIndexingConstant.make_loaderc                 C   s   t | j| j|S rI   )rE  rU   r   rD  rF   rF   rG   r    s    z#IndexingConstant.constant_to_deviceN)
r   r   r   r   r   r7   r   r   r   r  rF   rF   rF   rG   rE    s
   


rE  c                   @   s   e Zd Zedfejejee ee edddZ	e
dd Zdd ZeZd	d
 Zdd Zdd Zdd Zdd Zdd Zdd ZedddZejdddZdS )r-  r   r   r   r}   r~   r  c                 C   sd   |d ks,t |t |ks,td| d| || _|| _tdd |D sNt|| _|| _|| _d S )Nr  	, stride=c                 s   s   | ]}t |ttfV  qd S rI   )r4   r   rC  rz   rF   rF   rG   rj    s     z"Layout.__init__.<locals>.<genexpr>)rY   r@   r   r   ro  r}   _strider  r   r   r   r}   r~   r  rF   rF   rG   r     s    zLayout.__init__c                 C   s   | j S rI   )rH  r   rF   rF   rG   r~     s    zLayout.stridec                 C   sP   d}| j dkrd| j  }t| j d| jj d| j d| j d| j | dS )	Nr   r   z	, offset=z('', z, size=rG  r6  )r  rA   r   r   r   r}   r~   )r   r  rF   rF   rG   r     s
    
4zLayout.__str__c                 C   s>   t | jt| j| jD ] \}}}|dkr||kr dS qdS Nr   FT)r_   r~   r   r  r}   r   leftrightr}   rF   rF   rG   r    s     
 zLayout.is_contiguousc                 C   sR   t | j}|dkrdS t| jt| j| jD ] \}}}|dkr,||kr, dS q,dS )N)r^     Fr   T)rY   r}   r_   r~   r   )r   ndimrM  rN  r}   rF   rF   rG   is_channels_last_contiguous  s    
  z"Layout.is_channels_last_contiguousc                 C   sB   t | jtt| j| jD ] \}}}|dkr||kr dS qdS rK  )r_   r~   r*  r   r  r}   rL  rF   rF   rG   is_transposed  s    zLayout.is_transposedc                 C   s   t | jt |kstdgt | }tt |D ] }tjj| j| ||| < q0tt |d D ]}|| ||d  krb dS qbdS )Nr  r   FT)rY   r~   r@   rZ   r1   r   r   r   )r   ra   stride_orderedrT   rF   rF   rG   r    s    zLayout.is_stride_orderedc                 C   s:   dgt ttdt| jd  }t|g| }| |S r  )r5   r*  rZ   rY   r~   r  r   ra   rF   rF   rG   is_channels_last_stride_ordered  s    "z&Layout.is_channels_last_stride_orderedc                 C   s   t | j| j| j| j| jS rI   )r  r   r   r}   r~   r  r   rF   rF   rG   as_fixed  s    zLayout.as_fixedc                 C   s(   t jstdt| j d|   S )Nzconvert z to FixedLayout first)r   r   r@   rA   r   rV  r  r   rF   rF   rG   r  #  s
    zLayout.make_indexerr  c                 C   s<   | j |j ko:| j|jko:| j|jko:| j|jko:| j|jkS rI   rF  )r   otherrF   rF   rG   __eq__)  s    



zLayout.__eq__c                 C   s   t | j| j| jS rI   )r   r}   r~   r  r   rF   rF   rG   storage_size2  s    zLayout.storage_sizeN)r   r   r   r   r7   r   r   r
   r   r   propertyr~   r   r   r  rQ  rR  r  rU  rV  r  r  rY  r;   rZ  rF   rF   rF   rG   r-    s*   
	
		r-  c                	       sp   e Zd ZdZdedfejejee	e
 e	e f eee	e
 e	e f  ee
ef d fddZdd Z  ZS )	r  z A Tensor layout we cannot changeNr   rF  c                    s*   |d krt |}t ||||| d S rI   )r   r  r   r   rI  r   rF   rG   r   9  s    
zFixedLayout.__init__c                    s    fdd}|S )z1A closure containing math to read a given elementc                    sd   t | t  j  kr$t  jks*n t j}t|  j jD ]\}}}|dkr@|||  }q@|S r  )rY   r~   r}   r@   r  r_   )rU   resultrl   r~   szr   rF   rG   r  N  s    *z)FixedLayout.make_indexer.<locals>.indexerrF   r  rF   r   rG   r  K  s    zFixedLayout.make_indexer)r   r   r   r:  r   r7   r   r   r   r
   r   rC  r   r   r  r   rF   rF   r   rG   r  6  s   
r  c                       sn   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	dd Z
dd Zdd Zd fdd	Z  ZS )r   z(A Tensor layout we are allowed to changeFc                 C   sP   t | dkrg S tdg}t| dd  D ]}|||d   q,tt|S )Nr   r   r  )rY   r;   r   r*  rp  r5   )sizesZreversed_stridesr}   rF   rF   rG   r  ^  s    z!FlexibleLayout.contiguous_stridesc                 C   sV   t tt| t |ksttd}dgt| }|D ]}|||< || |  }q8|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        r   N)r   rZ   rY   r@   r;   r   )r^  ra   Znext_strider  rT   rF   rF   rG   fill_orderedg  s    
zFlexibleLayout.fill_orderedc                 C   s0   t tt| t |kstt|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r   rZ   rY   r@   rs   r   r_  )r^  ra   rr   rF   rF   rG   rS  x  s    zFlexibleLayout.stride_orderedc                 C   sD   t | t |kstdd |D }ttt ||jd}t| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        c                 S   s   g | ]}t jj|qS rF   )r1   r   r   r   r  rF   rF   rG   rW     s     z/FlexibleLayout.same_ordered.<locals>.<listcomp>rM  )rY   r@   rn  rZ   __getitem__r   r_  )r^  r~   rr   rF   rF   rG   same_ordered  s    zFlexibleLayout.same_orderedc                 C   s"   t | j| j| j| | j|| jS rI   )r  r   r   r}   rS  r  rT  rF   rF   rG   as_stride_order  s    zFlexibleLayout.as_stride_orderc                 C   s"   t | j| j| j| | j|| jS rI   )r  r   r   r}   r_  r  rT  rF   rF   rG   as_fill_order  s    zFlexibleLayout.as_fill_orderc                 C   s"   t | j| j| j| | j|| jS rI   )r  r   r   r}   ra  r  r   r~   rF   rF   rG   as_same_order  s    zFlexibleLayout.as_same_orderNc                    s2   |rt ||}n
t |}t |||| d S rI   )r   r_  r  r   r   )r   r   r   r}   r  r  r   rF   rG   r     s    
zFlexibleLayout.__init__)N)r   r   r   r:  r   r   r  r_  rS  ra  rb  rc  re  r   r   rF   rF   r   rG   r   Y  s   



			r   c                       s6   e Zd ZdZdd fddZdd Zdd	 Z  ZS )
AliasedLayoutz)Shares the same storage as another tensorr  r$  c                    s,   |  }t |j|j|j|j || _d S rI   )r   r   r   r   r   r}   r~   r%  )r   r%  rd  r   rF   rG   r     s    zAliasedLayout.__init__c                 C   s   |    S rI   )rV  r  r   rF   rF   rG   r    s    zAliasedLayout.make_indexerc                 C   s4   | j  j}|dkrdS ddlm} tjj||S )Nr   Tr   )	ALIGNMENT)	r%  r   r  Z
compile_fxrg  r1   r   r   Zstatically_known_multiple_of)r   r  rg  rF   rF   rG   maybe_guard_aligned  s
    z!AliasedLayout.maybe_guard_aligned)r   r   r   r:  r   r  rh  r   rF   rF   r   rG   rf    s   
rf  c                       st   e Zd Zed fddZejjdd Zej	dddZ
d	dd
dZdd Zedd Zdd Zdd Z  ZS )MutationLayoutr   c                    s@   t  | | | d  || _|   }tj	
| d S rI   )r   r   r   r   r   r   
get_bufferr  r1   r   Zmark_buffer_mutated)r   r   rN   r   rF   rG   r     s    zMutationLayout.__init__c                 C   s
   |   jS rI   )real_layoutr~   r   rF   rF   rG   r~     s    zMutationLayout.striderW  c                 C   s   |    S rI   )rl  rZ  r   rF   rF   rG   rZ    s    zMutationLayout.storage_sizer  c                    s,    fdd  | j }t|ts(td|S )Nc                    sB   t | tr | jS t | tr* |  S t | tr> | jS | S rI   )r4   ri  r   r  r  
MutableBoxre  rj  unwrap_viewsrF   rG   ro    s    




z/MutationLayout.get_buffer.<locals>.unwrap_viewsz%MutationLayout must refer to a buffer)r   r4   r  r@   )r   r\  rF   rn  rG   rk    s    	
zMutationLayout.get_bufferc                 C   s
   |   jS rI   )rk  rd  r   rF   rF   rG   rl    s    zMutationLayout.real_layoutc              	   C   s   |   t|tr|j}t|tr0|| r6d}n|   t|jjt }|rt	j
| | | dd t| | D dj}|   t|jjtstt||j_|jS )NTc                 S   s    g | ]\}}t jj||qS rF   r1   r   r   r!  rS   r.  r/  rF   rF   rG   rW   	  s   z/MutationLayout.realize_into.<locals>.<listcomp>r  )r   r4   r:   re  r  r   r  rd  r   r  r   r   r   r   r_   r   r@   ri  )r   srcdstZ	need_copyrF   rF   rG   realize_into  s(    
	zMutationLayout.realize_intoc                 C   s   | S rI   rF   r   rF   rF   rG   rV  	  s    zMutationLayout.as_fixedc                 C   s
   | j  S rI   )r   r  r   rF   rF   rG   r  	  s    zMutationLayout.make_indexer)r   r   r   r   r   r-  r~   getterr;   r   rZ  rk  rl  r   rt  rV  r  r   rF   rF   r   rG   ri    s   

ri  c                       s   e Zd ZU ee ed< eed<  fddZdd Zdd Z	d	d
 Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Z  Z S )5r  rN   rd  c                    s   t    d | _d S rI   r   r   r   rF   rG   r   	  s    
zBuffer.__post_init__c                 C   s
   | j  S rI   r4  r   rF   rF   rG   r   	  s    zBuffer.make_indexerc                 C   s   | j s
t| j S rI   )rN   r@   r   rF   rF   rG   r  #	  s    
zBuffer.get_namec                 C   s   | j jS rI   r.  r   rF   rF   rG   r   '	  s    zBuffer.get_devicec                 C   s   | j S rI   r   r   rF   rF   rG   r   *	  s    zBuffer.get_origin_nodec                 C   s   t | jdd S )Nr   )rJ   rd  r   rF   rF   rG   r   -	  s    zBuffer.get_dtypec                 C   s   t | jjS rI   r/  r   rF   rF   rG   r   0	  s    zBuffer.get_sizec                 C   s   t | jjS rI   r0  r   rF   rF   rG   r1  3	  s    zBuffer.get_stridec                 C   s   | j S rI   r5  r   rF   rF   rG   r   6	  s    zBuffer.get_layoutc                 C   s   |   S rI   )r   r   rF   rF   rG   r  9	  s    zBuffer.get_storage_numelc                 C   s   dS r   rF   r   rF   rF   rG   r   <	  s    zBuffer.is_externc                 C   s    t | jttfs| j | _d S rI   )r4   rd  MultiOutputLayoutrf  rV  r   rF   rF   rG   r  ?	  s    zBuffer.freeze_layoutc                 C   s"   t | jtst| j|| _d S rI   )r4   rd  r   r@   rb  rT  rF   rF   rG   r  C	  s    z&Buffer.freeze_layout_with_stride_orderc                 C   s"   t | jtst| j|| _d S rI   )r4   rd  r   r@   rc  rT  rF   rF   rG   freeze_layout_with_fill_orderG	  s    z$Buffer.freeze_layout_with_fill_orderc                 C   s"   t | jtst| j|| _d S rI   )r4   rd  r   r@   re  rd  rF   rF   rG   freeze_layout_with_same_orderK	  s    z$Buffer.freeze_layout_with_same_orderc                 C   s   t jjt|  dS r   r   r   rF   rF   rG   r   O	  s    zBuffer.is_zero_elementsc                    s(      rtt  dS  fdd}|S )Nr   c                    s    j  }t j|| S rI   )rd  r  r0   r2  rN   r3  r   rF   rG   r  W	  s    
z"Buffer.make_loader.<locals>.loader)r   r   r   r   r  rF   r   rG   r   R	  s    zBuffer.make_loaderc                 C   s   dS r   rF   r   rF   rF   rG   is_no_op]	  s    zBuffer.is_no_opc                 C   s   |   S rI   )r  r   rF   rF   rG   r9  `	  s    zBuffer.codegen_referencec                 C   s   d S rI   rF   r   rF   rF   rG   rr  c	  s    zBuffer.decide_layoutc                 C   s   t | jtr| jj gS dS rB  )r4   rd  rf  r%  r  r   rF   rF   rG   get_alias_namesf	  s    zBuffer.get_alias_namesc                 C   s   t | jtr| jj gS dS rB  )r4   rd  ri  r   r  r   rF   rF   rG   get_mutation_namesk	  s    zBuffer.get_mutation_namesc              
   C   s8   t tdd  t|  |  W  5 Q R  S Q R X d S r   )r   r   r   r"   r   r   r   rF   rF   rG   rl  p	  s
    zBuffer.get_read_writesc                 C   s
   |   jS rI   )rl  r   r   rF   rF   rG   r   w	  s    zBuffer.get_readsc                 C   s   d S rI   rF   r   rF   rF   rG   r   z	  s    zBuffer.realize)!r   r   r   r   rP   r   r-  r   r  r  r   r   r   r   r1  r   r  r   r  r  rw  rx  r   r   ry  r9  rr  rz  r{  rl  r   r   r   rF   rF   r   rG   r  	  s6   
r  c                   @   s   e Zd ZdS )InputBufferN)r   r   r   rF   rF   rF   rG   r|  ~	  s   r|  c                   @   s    e Zd ZdZdd Zdd ZdS )r  Nc                    s    fdd}|S )Nc                    s(    j  }ttj j j|| S rI   )	rd  r  r0   r2  r1   r   constant_namerN   r  r3  r   rF   rG   r  	  s
    
 z*ConstantBuffer.make_loader.<locals>.loaderrF   r  rF   r   rG   r   	  s    zConstantBuffer.make_loaderc                 C   s   t tj| j|| jS rI   )r  r1   r   r}  rN   rd  rD  rF   rF   rG   r  	  s    z!ConstantBuffer.constant_to_device)r   r   r   r  r   r  rF   rF   rF   rG   r  	  s   	r  c                   @   s   e Zd Zdd ZdS )NoneAsConstantBufferc                 C   s
   t jjjS rI   )r1   r   r7  none_strr   rF   rF   rG   r9  	  s    z&NoneAsConstantBuffer.codegen_referenceN)r   r   r   r9  rF   rF   rF   rG   r~  	  s   r~  c                       s$   e Zd Z fddZdd Z  ZS )ShapeAsConstantBufferc                    s   t    || _d S rI   )r   r   shape)r   r  r   rF   rG   r   	  s    
zShapeAsConstantBuffer.__init__c                 C   s6   t jjt jj| j}t jjr.d| dS |S d S )Nztorch::tensor(r6  )r1   r   r7  Zexpr_printerr   r  r  r   )r   exprrF   rF   rG   r9  	  s    z'ShapeAsConstantBuffer.codegen_reference)r   r   r   r   r9  r   rF   rF   r   rG   r  	  s   r  c                       s   e Zd ZU eed< edd Zdd Z fddZdd	 Z	d
d Z
dd Zdd ZedddZdd Zdd Zdd Zdd Zdd Z  ZS )rk  re  c                 C   s   t |  jS rI   )rY   rl  r   r   rF   rF   rG   	num_reads	  s    zComputedBuffer.num_readsc              
   C   sp   t tddX | j rBt|  | j | j W  5 Q R  S t|  | j W  5 Q R  S W 5 Q R X d S r   )	r   r   r   re  r   r"   get_store_functionr   r   r   rF   rF   rG   rl  	  s    
zComputedBuffer.get_read_writesc                    s>   t | jdo$| jtjjko$|  dk}|r4| j S t  S )Nr   r   )	hasattrre  rN   r1   r   Zmutated_buffersr  r   r   )r   Z
can_inliner   rF   rG   r   	  s    

zComputedBuffer.make_loaderc                 C   s@   | j   }| j r*t| jj| j|S t| jj| j|S d S rI   )	rd  rV  r  re  r   r   r?  rN   r  r  rF   rF   rG   r  	  s    
z!ComputedBuffer.get_store_functionc                    s   t | jtrt| j | j \\ }|  j	}dd |D }t
dd |D sZtfdd|D }|r fdd|D }ddlm} |||  S d	S )
al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 S   s0   g | ](}|j tjj kr(tjj|j  nd qS rI   )rN   r1   r   rq  r|  rg  rF   rF   rG   rW   	  s   z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                 s   s    | ]}t |tjtjfV  qd S rI   )r4   r   StarDep	MemoryDeprg  rF   rF   rG   rj  	  s   z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                    s.   g | ]&}t |tjrt|jd d  D qS )c                 S   s    i | ]}|d kr|t d qS r  r  rS   vrF   rF   rG   rn   	  s       z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)r4   r   r  r-   rU   rg  )r@  rF   rG   rW   	  s    c                    s   g | ]}t jj| qS rF   r1   r   r   r{  rS   r  
index_varsrF   rG   rW   	  s    r   pick_loop_orderN)r4   rd  r   r   ry  re  r   r   rl  r   ro  r@   	schedulerr  )r   rv   r   
reads_bufsZstride_lengthsr  rF   )r  r@  rG   get_fill_order	  s,     


zComputedBuffer.get_fill_orderc                 C   s0   t | jtr,|  }|r$| | n|   d S rI   )r4   rd  r   r  rw  r  rT  rF   rF   rG   rr  	  s
    zComputedBuffer.decide_layoutc              	      s  t jj j dd\}}ttd * t	
  rH|n
|dd |}W 5 Q R X |j  dd |j D }|j |j g }g }g }g }| D ]V\}	}
|	|d kr|rt||	 ||
 q|	|d kst||	 ||
 qttt|gt }t|D ],\}}t|tr(t|d	r(|j||< q(d fd
d	}|| }|||||\}}}||||\}}}t|t|kr|_t j||dd\\}}}t	|||||g|}||f|fS )a  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders
        qr2   r  Nr   c                 S   s,   g | ]$}|t jj kr$t jj| nd qS rI   )r1   r   rq  r|  )rS   Z
reads_namerF   rF   rG   rW   
  s   z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>r   iter_reordering_reindexc           	         s\    | |||\}}}|| } tjj| |t | |\}}}|| } t||}|||fS rI   )_apply_loop_reorderingr1   r   r   _simplify_loopsr    rh   )	Zx_varssupport_varsr^  reordering_reindexZreindex0rf   rg   pruner]   Zindex_formulasmemory_addrsr   rF   rG   simplify_and_reorder4
  s         



zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderz)N)r   ry  re  r   r   r   r   r  r   LoopBodyr  r   indexing_exprsr  reads_name2exprr|  writes_name2expritemsr@   rp  rd   rZ   rY   rq   r4   rk  r  r  Zindex_vars_no_squeeze)r   rK   
var_rangesr  r  r  Zreduce_varsZ
index_sizeZreduce_sizer  r{   r  rT   Z	reads_bufr  r  Ziter_rangesZiter_reindexr  Zreduce_rangesZreduce_reindexrv   Z	iter_varsrF   r  rG   r   
  s~    
  


    
  
    z#ComputedBuffer.simplify_and_reorderNc           
   
      s  ddl m} |dkrg }z fdd|D }t|t|krRt|d t ksVt|dk	rtt|D ]2}z|| || ||< W qj tk
r   Y qjX qjtt|||}	W nB tk
r   tj	rt
dtt | ttt}	Y nX fdd|	D t|	t|	fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        r   r  Nc                    s   g | ]}t jj| qS rF   r  r  )r  r  rF   rG   rW   l
  s   z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                    s   g | ]} | qS rF   rF   rR   )r^  rF   rG   rW   
  s     )r  r  rY   r@   rZ   r5   r*  	Exceptionr   debuglogwarningr^   r_   rd   rb   )
r  r  r^  r  r  Zpriority_idxr  r  rT   ra   rF   )r  r^  r  rG   r  Z
  s6    
z%ComputedBuffer._apply_loop_reorderingc                 C   s
   | j  S rI   )re  r   r   rF   rF   rG   r   
  s    z!ComputedBuffer.get_reduction_sizec                 C   s
   | j  S rI   )re  r   r   rF   rF   rG   r   
  s    z!ComputedBuffer.get_reduction_typec                 C   s
   | j  S rI   )re  r   r   rF   rF   rG   ry  
  s    zComputedBuffer.is_no_opc                 C   s   dS NTrF   r   rF   rF   rG   should_allocate
  s    zComputedBuffer.should_allocatec                 C   s   | j |S )r
  )re  r  rD  rF   rF   rG   r  
  s    z!ComputedBuffer.constant_to_device)NN)r   r   r   r   r   r%   r  rl  r   r  r  rr  r  r   r  r   r   ry  r  r  r   rF   rF   r   rG   rk  	  s$   

+Z  ,rk  c                       sX   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Z  ZS )TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    c                    s4   t  jd |d t|| _|| _tj| | _	d S )N)rN   rd  )
r   r   InputsKernelunwrap_storageinputsmake_kernel_renderr1   r   register_bufferrN   )r   rd  r  r  r   rF   rG   r   
  s    zTemplateBuffer.__init__c                 C   s   |   S rI   )normalized_read_writesr   rF   rF   rG   rl  
  s    zTemplateBuffer.get_read_writesc                    sL   |   | j   fdd}tj||  ddd}dd | jD |_|S )Nc                    s"   t |dkstt | dS )Nr   Zfake)rY   r@   r0   r  r  r  rN   rF   rG   dummy
  s    z4TemplateBuffer.normalized_read_writes.<locals>.dummyrF   T)	normalizec                 S   s   h | ]}t | qS rF   r   r  r  r  rF   rF   rG   r   
  s     z8TemplateBuffer.normalized_read_writes.<locals>.<setcomp>)r  rd  r  r   r"   r   r  r   )r   r  depsrF   r  rG   r  
  s    
   z%TemplateBuffer.normalized_read_writesc                 C   s   dS r  rF   r   rF   rF   rG   r   
  s    z!TemplateBuffer.get_reduction_sizec                 C   s   d S rI   rF   r   rF   rF   rG   r   
  s    z!TemplateBuffer.get_reduction_typec                 C   s   dS r   rF   r   rF   rF   rG   ry  
  s    zTemplateBuffer.is_no_opc                 C   s   dS r  rF   r   rF   rF   rG   r  
  s    zTemplateBuffer.should_allocatec                 C   s   |   dfd fS rB  )r   r   rF   rF   rG   r  
  s
    z#TemplateBuffer.simplify_and_reorder)r   r   r   r:  r   rl  r  r   r   ry  r  r  r   rF   rF   r   rG   r  
  s   r  c                   @   sJ   e Zd ZU ee ed< dd Zdd Zedd Z	edd	 Z
d
d ZdS )r  r  c                 C   s   t | S rI   r  r  rF   rF   rG   get_read_writes_input
  s    z"InputsKernel.get_read_writes_inputc                    sp   g } j D ]8}t|tr2| fdd|D  q
| | q
tjt|t	 
 ht g d t dS )Nc                    s   g | ]}  |qS rF   )r  r  r   rF   rG   rW   
  s     z0InputsKernel.get_read_writes.<locals>.<listcomp>)Z	op_counts)r  r4   r5   extendrp  r  r   Z
ReadWritesr   r  r  collectionsCounter)r   Zstar_depinputrF   r   rG   rl  
  s    

zInputsKernel.get_read_writesc                 C   sX   t | tr| j} t | tr | j} t | tr>t | ts>t| } t | ttfsTt	| | S rI   )
r4   r:   re  r  r  r  ExternKernelrealize_inputr  r@   r   rF   rF   rG   unwrap_storage_for_input
  s    


z%InputsKernel.unwrap_storage_for_inputc                 C   s@   g }| D ]2}t |tr&dd |D }n
t|}|| q|S )Nc                 S   s   g | ]}t |qS rF   )r  r  rR   rF   rF   rG   rW   
  s     z/InputsKernel.unwrap_storage.<locals>.<listcomp>)r4   r5   r  r  rp  )r  Z
inputs_newr   rF   rF   rG   r  
  s    

zInputsKernel.unwrap_storagec                 C   s   dS r  rF   r   rF   rF   rG   r   
  s    zInputsKernel.is_externN)r   r   r   r
   r  r   r  rl  r   r  r  r   rF   rF   rF   rG   r  
  s   




r  c                   @   s   e Zd Zdd ZdS )	NopKernelc                 C   s   dS r  rF   r   rF   rF   rG   ry  
  s    zNopKernel.is_no_opN)r   r   r   ry  rF   rF   rF   rG   r  
  s   r  c                   @   s0   e Zd ZdZedd Zedd Zdd ZdS )	ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                 C   s  |d   }|d  }t|d  }dg}|| g}d|  krPt|k sVn ttdt|D ]}||  }	|||  t|	t|kst||  |kst||   |ksttt|D ]>}
|
|kr||
 |	|
  ||
< qtj	j
||
 |	|
 ||
< q|||  qdt|}tt|D ]D}|| }t|r0| }t|tr0| r0t|} qvq0td t||||dg d}t|}tt|D ]4}|jj| || t|||| ||  qtj	|j|j_| |jj|j_|S )Nr   r   )r   r   r}   r~   rN   rd  r  )r   r   r5   r   rY   r@   rZ   rp  r1   r   r   r!  r   r  r   r   r4   r  rQ  r   r  r  re  r  rt  r;  r   r  rN   r  )r   r  r  r   r   r  Zoffsets_startZoffsets_endrT   
input_sizer  output_strider   rd  Zconcat_kernelkernelrF   rF   rG   r     sh    
 




zConcatKernel.createc              	   C   s   t |ts(t|r(t|\}}t||}t |ts:t|t |trR| |j|S t |tr|	  t |jj
trt |jtst||j_
|jS tj| | | dd t| | D d}| ||S )Nc                 S   s    g | ]\}}t jj||qS rF   rp  rq  rF   rF   rG   rW   ^  s   z-ConcatKernel.realize_into.<locals>.<listcomp>r  )r4   r  r   r  r@   r:   rt  re  r  r   rd  r   r  rf  r  r   r   r   r   r_   r   )r   rr  rs  r  rd  pwrF   rF   rG   rt  D  s0    



 	zConcatKernel.realize_intoc                 C   s   dS r  rF   r   rF   rF   rG   r  e  s    zConcatKernel.should_allocateN)r   r   r   r:  r   r   rt  r  rF   rF   rF   rG   r    s   
;
 r  c                   @   s  e Zd ZU dZeedf ed< eje	dZ
eeef ed< dZee ed< dd	 Zd
d Zdd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zdd Zd d! Zd"d# Zd$d% Z d&d' Z!d(d) Z"d*d+ Z#d,d- Z$d.d/ Z%e%Z&dS )0r  rF   .constant_args)default_factoryrL   Noutput_viewc                 C   s    t | jtr|   |   d S rI   )r4   rd  r   apply_constraintr  r   rF   rF   rG   rr  o  s    zExternKernel.decide_layoutc                 C   s    t | |\}}|r|| d S rI   )r)   	writeline)r   wrapperZ
origin_strZdetailed_origin_strrF   rF   rG   codegen_commentt  s    zExternKernel.codegen_commentc                 C   s
   t  d S rI   r  r   r  rF   rF   rG   codegeny  s    zExternKernel.codegenc                 C   s:   t j|  |  |  |  |  |  d}|  |S )N)r   r   r   r   r   r   )	r  r   r   r   r   r   r   r   r   )r   r  rF   rF   rG   
copy_input|  s    zExternKernel.copy_inputc                    sn  t |j||j}t|dd\}}d }tjjrL|rLt|tj	j
rLt|||}t|\} g g }	g }
|D ]R}t|t d r|	| qjt|tjrtjjjj|d d}|
| qj fdd}fdd|	D }	|	D ]}t|rt|dd	 qg }|	D ]B}| tjjkr2|tjj|   n|t|dd
 q|||
\}}|||}||	|
||fS )NT)Zreturn_schemasr  )r  c                    sd   g }t | }t |}D ]&}|r0|t| q|t| qt| }|dg |di fS )NrK   rL   )iterrp  nextpytreeZtree_unflattenget)Znew_tensor_argsZnew_non_tensor_argsr\  Z
it_tensorsZit_non_tensorsZ	is_tensorr   )	args_specis_arg_tensorrF   rG   unflatten_args  s    z3ExternKernel.process_kernel.<locals>.unflatten_argsc                    s   g | ]}  |qS rF   r  r  r   rF   rG   rW     s     z/ExternKernel.process_kernel.<locals>.<listcomp>r  r   )r   bind	argumentsr   r1   r   r   r4   r7   _opsZOpOverloadPacketr/   r  Ztree_flattenrp  r   r;   r   r   r  Zcreate_symintnoder   r  r  	constantsr   )r   r  rK   rL   Zbinded_argsrv   ZschemasschemaZ	args_flattensor_argsnon_tensor_argsargr  r   Zexample_argsnew_argsZ
new_kwargsexample_outputrF   )r  r   r  rG   process_kernel  sD    
zExternKernel.process_kernelc           	   	   C   s   t |tstt |tr|S |   tj| dd\}}|d }|	 |}t
jj||}t
jj||}t
jj||}t||| }||krtd||| t t|jt| | | ||ddS )z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        r   r2   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%srF  re  rd  )r4   r  r@   r  r  r  r   ry  r   r  r1   r   r   rz  Zstride_varsZ
offset_varr+   r  r  r   re  r  r   r   )	r   r   Z
index_argsr  rm  rU   r  r  expectedrF   rF   rG   convert_to_reinterpret_view  s@    
 
z(ExternKernel.convert_to_reinterpret_viewc                 C   s   |d krt  S t|tjtjjjtfr.t|S t|t	rZt
jtj|j| | dS t|trh|S t|tr~| |jS t|tr|S t|tr|  t| rt| jtsz| |W S  tk
r   Y nX t|tr|  |S | |S )N)r   r   ) r~  r4   r;   r   r=   r>   r?   rC  r  rC  r1   r   Zadd_tensor_constantr7   Ztensorr   r   r   r  r:   r  re  r  r  r   r   r  r  r  r   r  r  r   r   rF   rF   rG   r    s8    




 
zExternKernel.realize_inputc                 C   sD   t |r:t| dkr|S | D ]}|dkr$|  S q$| |S r  )r   rY   r1  r  )r   r   r~   rF   rF   rG   require_stride1  s    
zExternKernel.require_stride1c                 C   sr  |  dkr|S t|rt| tr:t|dd|d |S t| trZ| |rZ|S t| trt| 	 trt
dn(t| 	 tr| 	 |r|S t|tr| |r|S t|trDt|jtrDt|jtsDt| rDt| jtsDz| |j|_| ||W S  tk
rB   Y nX | |}t|dd|d t||snt
|S )Nr   TFr  z<the MutationLayout's real layout shouldn't be FlexibleLayout)r   r   r4   r   r   r  r  r  ri  rl  r@   r|  r:   re  r  r  r  r  r  require_stride_orderr   r  r  )r   r   ra   rF   rF   rG   r  $  sf        
 



z!ExternKernel.require_stride_orderc                 C   s   |  |tS rI   )r  NHWC_STRIDE_ORDERr  rF   rF   rG   require_channels_lastS  s    z"ExternKernel.require_channels_lastc              	   C   s    |  |tttt| S rI   )r  r5   r*  rZ   rY   r   r  rF   rF   rG   require_contiguousW  s    zExternKernel.require_contiguousc                 C   s   d S rI   rF   r   rF   rF   rG   r  [  s    zExternKernel.apply_constraintc                 C   s   t tjjj| jS rI   )r   r1   r   r7  val_to_arg_strr  r   rF   rF   rG   codegen_const_args^  s    zExternKernel.codegen_const_argsc                 C   sf   g }| j D ]H}t|trDdd |D }dd| d}|| q
||  q
||   |S )Nc                 S   s   g | ]}|  qS rF   r9  rR   rF   rF   rG   rW   e  s     z-ExternKernel.codegen_args.<locals>.<listcomp>[r  ])r  r4   r5   r   rp  r9  r  r  )r   rK   r   r   r9  rF   rF   rG   codegen_argsa  s    

zExternKernel.codegen_argsc                 C   s   || j kr| j |S t| dr|| jkr| j|d}|d kr| j|d}t|tksttdt| tt|  S |S td| dd S )Nkwargs_default_valuer   rA   $unsupported default_value arg_type: zarg z6 not found in self.kwargs or self.kwargs_default_value)rL   r  r  r  rP   default_value_mapr@   )r   arg_namer  arg_typerF   rF   rG   get_kwargs_valuem  s$    


zExternKernel.get_kwargs_valuec                 C   sd   g }| j r`tjjrL| js td| jD ]"}| |}|tjj	| q&ndd | j 
 D }|S )Nz0ordered_kwargs_for_cpp_kernel has to be providedc                 S   s(   g | ] \}}| d t jj| qS r   r1   r   r7  r  rS   kr  rF   rF   rG   rW     s   z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)rL   r1   r   r   ordered_kwargs_for_cpp_kernelr@   r  rp  r7  r  r  )r   rL   r  r  rF   rF   rG   codegen_kwargs  s    

zExternKernel.codegen_kwargsc              	   C   sX   t jrTtjjsTtjj|  }tjj|  }|	d| 
  d| d| d d S )Nzassert_size_stride(r  r6  )r   Zsize_assertsr1   r   r   r7  r8  r   r1  r  r  )r   r  r}   r~   rF   rF   rG   codegen_size_asserts  s    z!ExternKernel.codegen_size_assertsc                 C   s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r1  )r   _sizerH  rF   rF   rG   get_group_stride  s    zExternKernel.get_group_stridec                    s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                    s   g | ]}  |qS rF   )r   r  )r   rF   rG   rW     s     z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S   s   g | ]}t d | qS )dr  rR   rF   rF   rG   rW     s     T)rN  reversec                 S   s   i | ]\}}||qS rF   rF   rk   rF   rF   rG   rn     s      z-ExternKernel.canonicalize.<locals>.<dictcomp>c                    s   g | ]} | qS rF   rF   rR   ro   rF   rG   rW     s     c                    s   g | ]} | qS rF   rF   rR   r  rF   rG   rW     s     cc                    s   g | ]} |qS rF   rF   r  )add_varrF   rG   rW     s     )r1   r   r   r   r1  rZ   rY   rn  r`  rq   r  r  r#   r^   r_   r-   r;   r  r6   )r   r^  r  Zindex_orderra   r  rU   Z	new_sizesr]   r  rv   replacementrF   )r  r  rp   r   rG   canonicalize  s(      
 zExternKernel.canonicalizec                    sP   t  dd }d|g}| fddt D 7 }|d j  |S )Nr  zkernel=c                    s$   g | ]}|j  d t |j  qS r   )rN   rJ   )rS   fieldr   rF   rG   rW     s   z(ExternKernel.__str__.<locals>.<listcomp>r   )rJ   dataclassesfieldsrp  r   r   )r   Zkernel_namer   rF   r   rG   r     s    zExternKernel.__str__)'r   r   r   r  r   r   r   r  r  r^   rL   r	   rP   r  r   r  rr  r  r  r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r   r   rF   rF   rF   rG   r  i  s@   


D
,

	
.

	r  c                       s@   e Zd ZU dZee ed< dd Zd
 fdd	Zdd	 Z	  Z
S )ExternKernelOutNr  c                 C   s6   |  | |  |  }|| j|  || j d S rI   )r  r  r  Zgenerate_extern_kernel_outr  r9  r  r   r  rK   rF   rF   rG   r    s    
zExternKernelOut.codegenrF   c	           	         sN   t  d || |||pi  || _tj| | _tjjr>|n|| _	|| _
d S rI   )r   r   r  r  r1   r   r  rN   r   r  r  )	r   rd  r  r  rL   r  r  
cpp_kernelr  r   rF   rG   r     s        zExternKernelOut.__init__c                 C   s   dS r  rF   r   rF   rF   rG   r    s    zExternKernelOut.should_allocate)rF   NNNNrF   )r   r   r   r  r   r  r   r  r   r  r   rF   rF   r   rG   r
    s   
      r
  c                       s&   e Zd Zeejd fddZ  ZS )RandomSeeds)countr   c                    s@   t t j}t jt|t j|gdg |j|j|ggddd d S )Nrb  zaten.randint.low_outzat::randint_out)rd  r  r  r  r  )r7   r  r  r   r   r  r!  r   )r   r  r   Zlimitsr   rF   rG   r     s    zRandomSeeds.__init__)r   r   r   rC  r7   r   r   r   rF   rF   r   rG   r    s   r  c                       s6   e Zd Zdd Zd fdd	Zdd Zd	d
 Z  ZS )r  c                 C   sR   |  | |  |  }tjj|  | j|| 	  t
| jtrN| | d S rI   )r  r  r  r1   r   r7  Zgenerate_extern_kernel_allocr  r  r   r4   rd  r-  r  r  rF   rF   rG   r    s    
   zExternKernelAlloc.codegenrF   Nc                    sH   t  d || |||pi  tj| | _tjjr8|n|| _|| _	d S rI   )
r   r   r  r1   r   r  rN   r   r  r  )r   rd  r  r  rL   r  r  r  r   rF   rG   r     s    
    zExternKernelAlloc.__init__c                 C   s   dS r   rF   r   rF   rF   rG   r  "  s    z!ExternKernelAlloc.should_allocatec                 C   s   t d S rI   r  r   rF   rF   rG   r  %  s    z"ExternKernelAlloc.apply_constraint)rF   NNNrF   )r   r   r   r  r   r  r  r   rF   rF   r   rG   r    s        r  c                       s<   e Zd ZdZdZdd Zdd Zdd Z fd	d
Z  Z	S )InplaceBernoulliFallbackzE
    This needs to be a custom class to handle mutation properly
    zaten.bernoulli_c                 C   sB   dd | j D \}|| j d| ddtt| j d d S )Nc                 s   s   | ]}|  V  qd S rI   r  rS   r   rF   rF   rG   rj  1  s     z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>(r  r6  )r  r  r  r   r   reprr  )r   r  r   rF   rF   rG   r  0  s    $z InplaceBernoulliFallback.codegenc                 C   s   dS r   rF   r   rF   rF   rG   r  6  s    z(InplaceBernoulliFallback.should_allocatec                 C   s   t | jtst| jj fS rI   r4   rd  ri  r@   r   r  r   rF   rF   rG   r{  9  s    z+InplaceBernoulliFallback.get_mutation_namesc                    s0   t  d t|| |g| tj| | _d S rI   )r   r   ri  r  r1   r   r  rN   )r   r   r  r   rF   rG   r   =  s    
z!InplaceBernoulliFallback.__init__)
r   r   r   r:  r  r  r  r{  r   r   rF   rF   r   rG   r  )  s   r  c                       sN   e Zd ZdZdd Zdd Zdd Zdd	d
eee	 e
d fddZ  ZS )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    c              
   C   st   | j rdd | jD \}}}ndd | jD \}}| jd }|||| jd ||g| j| j| j | jd |   d S )Nc                 s   s   | ]}|  V  qd S rI   r  r  rF   rF   rG   rj  P  s     z*ScatterFallback.codegen.<locals>.<genexpr>c                 s   s   | ]}|  V  qd S rI   r  r  rF   rF   rG   rj  R  s     r   r   r  )src_is_tensorr  r  Zgenerate_scatter_fallbackr  rO   rL   r  )r   r  r   rU   rr  rF   rF   rG   r  N  s    
zScatterFallback.codegenc                 C   s   dS r   rF   r   rF   rF   rG   r  ^  s    zScatterFallback.should_allocatec                 C   sN   |dkr6| j r |d krdnd}qJ|d ks0tdd}n|d k	sFtdd}|S )Naten.scatter_zat::scatter_outzat::scatter_reduce_outz:Expect reduce to be None for aten.scatter_ with scalar srcz5Expect reduce to be not None for aten.scatter_reduce_)r  r@   )r   rO   r  r  rF   rF   rG   get_cpp_kernela  s    zScatterFallback.get_cpp_kernelNTr  include_self)r  r  r  c             	      s   |dkst t|t _tjjrJddd}||kr:|| } || _n| _| _	 jr| fdd|||fD }	|f}
n fdd||fD }	||f}
t
 d t| |	|
||d d	d
g _tj  _d S )N>   r  aten.scatter_reduce_r#  r"  )r  multiplyc                    s   g | ]}  |qS rF   r  r  r   rF   rG   rW     s     z,ScatterFallback.__init__.<locals>.<listcomp>c                    s   g | ]}  |qS rF   r  r  r   rF   rG   rW     s     r  r  r  )r@   r4   r:   r  r1   r   r   r  r  rO   r   r   ri  r  r  r  rN   )r   rO   r   r  rU   rr  r  r  Zget_operator_enumtensorsr  r   r   rG   r   s  s.    

zScatterFallback.__init__)r   r   r   r:  r  r  r  rC  r   rP   r  r   r   rF   rF   r   rG   r  G  s   r  c                       s0   e Zd ZdZdd Zdd Z fddZ  ZS )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    c                 C   s   dd | j D ^}}}g }t|}t| jD ]6\}}| j| d k	rR|t| q,|tjjj	 q,tjjj
 d| tjjj }	||	|f|  }
||| j|
 d S )Nc                 s   s   | ]}|  V  qd S rI   r  r  rF   rF   rG   rj    s     z+IndexPutFallback.codegen.<locals>.<genexpr>r  )r  r  rq   ru  rp  r  r1   r   r7  r  open_bracketr   Zclosed_bracketr  r  Zwrap_kernel_callr  )r   r  r   r  valid_indicesru  Ziter_valid_indicesrT   rv   Zindices_strrK   rF   rF   rG   r    s    "zIndexPutFallback.codegenc                 C   s   dS r   rF   r   rF   rF   rG   r    s    z IndexPutFallback.should_allocatec                    sp   | _ dd |D } fdd||f|D }t d t| ||f tj  _tjj	rfdnd _
d S )Nc                 S   s   g | ]}|d k	r|qS rI   rF   rR   rF   rF   rG   rW     s      z-IndexPutFallback.__init__.<locals>.<listcomp>c                    s   g | ]}  |qS rF   r  r  r   rF   rG   rW     s     zat::index_put_zaten.index_put_)ru  r   r   ri  r  r1   r   r  rN   r   r  )r   r   ru  r  
accumulater  r  r   r   rG   r     s    zIndexPutFallback.__init__)r   r   r   r:  r  r  r   r   rF   rF   r   rG   r    s   r  c                   @   s    e Zd Zedd Zdd ZdS )
DeviceCopyc                 C   s   |  s(tdd | D r(||S tjj|j tj	|j
 tjj| j tj	| j
 td tt|| | d| |gS )Nc                 s   s(   | ] }|j tjjkot|tjV  qd S rI   )rN   r1   r   r  r4   r   r  rg  rF   rF   rG   rj    s   z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programrb  )r   ro  r   r  r1   r   Zdevice_typesr  rA   Zadd_device_idxrU   r   r(   r!  r   r   r   r  )r   r   r   rF   rF   rG   r     s"    

zDeviceCopy.createc                 C   sv   |   }t|dkst| jrJ|| j  d|d  dtjjj	  n(||   d|d  dtjjj	  d S )Nr   .copy_(r   r6  )
r  rY   r@   r  r  r9  r1   r   r7  endingr  rF   rF   rG   r    s    " zDeviceCopy.codegenN)r   r   r   r   r   r  rF   rF   rF   rG   r!    s   
r!  c                   @   s   e Zd ZdZdd ZdS )r9   z
    The result of a call to aten._local_scalar_dense.

    This is not yet implemented.  The one model (so far) that calls this
    (fastNLP_Bert) does not actually use the result.  So we expect this
    node to get dead code eliminated.
    c                 C   s   dS rB  rF   r   rF   rF   rG   r     s    zDynamicScalar.get_readsN)r   r   r   r:  r   rF   rF   rF   rG   r9     s   r9   c                       sf   e Zd Zd fdd	Zdd Zdd Zdd	 Zed
d Z fddZ	e
dd Z fddZ  ZS )FallbackKernelNc           	         sP  t  |t|t| d| _t|tjjr2|jn|}t	tj
j|jd |krtjjr`d|j n
d|j | _|d k	rdd |jD | _dd |jD | _dd |jD | _nvt|tjjrt	tjj|jd |krd	|j | _ntd
n8tjjrd| _| | n|jdd d|j | _|| _|d kr8i n|| _tj| j d S )NFzat::zaten.c                 S   s    g | ]}|j s|j|jd qS )rA   r   )
kwarg_only	real_typer  r  rF   rF   rG   rW     s   z+FallbackKernel.__init__.<locals>.<listcomp>c                 S   s   g | ]}|j r|jqS rF   r&  rN   r  rF   rF   rG   rW     s     c                 S   s$   i | ]}|j r|j|j|jd qS r%  )r&  rN   r'  r  r  rF   rF   rG   rn     s    z+FallbackKernel.__init__.<locals>.<dictcomp>ztorch._prims.rng_prims.z.Unable to find HigherOrderOperator kernel nameTz._ops.z.ops..)r   r   r6   use_cpp_op_schemar4   r7   r  Z
OpOverloadZ_overloadpacketrJ   r0   atenr   r1   r   r   r  r  args_default_valuer  r  ZHigherOrderOperatorZ_primsZ	rng_primsr   set_cpp_kernelr   replacer  rL   Zwarn_fallback)	r   rd  r  r  Znontensor_argsr  rL   r  Zop_overload_packetr   rF   rG   r     sT    



zFallbackKernel.__init__c                    s   ddl m} |jjr&td|j ddd  t fdd|jjD sXt|j d	t fd
d|jjD st|j d|jj	| _
|jj| _| j
dd d| j | _||| _dd |jjD | _d S )Nr   )get_cpp_op_schemazmutable z" is not supported with cpp_wrapperc                 S   s   | j d kp| j j S rI   )Z
alias_infoZis_write)r  rF   rF   rG   is_not_write@  s    z3FallbackKernel.set_cpp_kernel.<locals>.is_not_writec                 3   s   | ]} |V  qd S rI   rF   r  r0  rF   rG   rj  C  s    z0FallbackKernel.set_cpp_kernel.<locals>.<genexpr>z< with alias_info arguments is not supported with cpp_wrapperc                 3   s   | ]} |V  qd S rI   rF   r  r1  rF   rG   rj  F  s    z: with alias_info returns is not supported with cpp_wrapperz::rv   c                 S   s   g | ]}|j r|jqS rF   r(  r  rF   rF   rG   rW   Q  s     z1FallbackKernel.set_cpp_kernel.<locals>.<listcomp>)Zcodegen.wrapperr/  Z_schemaZ
is_mutabler@   r   ro  r  ZreturnsrN   r  Zoverload_namecpp_kernel_overlad_namer.  cpp_kernel_keycpp_op_schemar  )r   r  r/  rF   r1  rG   r-  6  s0    






zFallbackKernel.set_cpp_kernelc                 C   s   t | dstd|t| jk s:td| dt| j | j| d }|d kr| j| d }t|tks|tdt| tt|  S |S d S )Nr,  z*self.args_default_value has to be providedzexpected the index z2 to be smaller than len(self.args_default_value): r   rA   r  )r  r@   rY   r,  rP   r  )r   rm   r  r  rF   rF   rG   get_arg_default_valueU  s(     
z$FallbackKernel.get_arg_default_valuec                    s   t jG dd d  fddjD }|j\}}dd |D }tjjrtdrt	|}t	j
}||k rfddt||D }dd |D }|| j| |S )	Nc                   @   s   e Zd ZU eed< dd ZdS )z)FallbackKernel.codegen_args.<locals>.Shimrefc                 S   s   | j S rI   )r6  r   rF   rF   rG   r   l  s    z2FallbackKernel.codegen_args.<locals>.Shim.__repr__N)r   r   r   r   r   r   rF   rF   rF   rG   Shimh  s   
r7  c                    s   g | ]} |  qS rF   r  r  )r7  rF   rG   rW   o  s     z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S   s   g | ]}t jj|qS rF   r  r  rF   rF   rG   rW   q  s     r,  c                    s   g | ]}  |qS rF   )r5  rR   r   rF   rG   rW   w  s    c                 S   s   g | ]}t jj|qS rF   r  r  rF   rF   rG   rW   z  s     )r  	dataclassr  r  r  r1   r   r   r  rY   r,  rZ   r  rL   update)r   r  rK   rL   Zn_argsZ
n_pos_argsZpos_argsrF   )r7  r   rG   r  g  s     


zFallbackKernel.codegen_argsc                 C   s   | r| d   S t|tjr"|jS t|ttfrdd |D }dd |D }t|dkr`|d S |D ]}|jdkrd|  S qd|d S d S )Nr   c                 S   s   h | ]}t d |qS rI   )r$  find_devicer  rF   rF   rG   r     s     z-FallbackKernel.find_device.<locals>.<setcomp>c                 S   s   g | ]}|r|qS rF   rF   )rS   r   rF   rF   rG   rW     s      z.FallbackKernel.find_device.<locals>.<listcomp>r   r   )	r   r4   r7   Tensorr   r5   r6   rY   rA   )r  r  Zdevicesr   rF   rF   rG   r:    s    

zFallbackKernel.find_devicec                    sH   | j r8|  |  }||  | j|| j| j| j nt	 
| d S rI   )r*  r  r  6generate_extern_kernel_alloc_and_find_schema_if_neededr  r  r4  r3  r2  r   r  r  r   rF   rG   r    s    	zFallbackKernel.codegenc              	      s   t jf}||krtjjnt }|" | j|f||\}}}}	}
W 5 Q R X t||}|sbt	dtt
|||||	|
d fdd  |g S )Nz"Not sure where to find device info)r  c                    s   t ttfr4t fddttD S t tjrjtt	j
jt t  S t trxS d kstdd S d S )Nc                 3   s*   | ]"} | t |fg V  qd S rI   )rA   rR   )generate_outputru  outputrF   rG   rj    s   zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>z+FallbackKernel output type is not supported)r4   r5   r6   rA   rZ   rY   r7   r;  MultiOutputr  r   r   r&   r}   r~   rC  r@   )r>  ru  r=  packed)ru  r>  rG   r=    s&    




z.FallbackKernel.create.<locals>.generate_output)r+  Z*_fused_moving_avg_obs_fq_helper_functionalr1   r   	fake_moder   r  r$  r:  r@   rv  )r   r  rK   rL   Zfake_incorrect_kernelscontextr  r  r  r  r  r   rF   r@  rG   r     s0    
	zFallbackKernel.createc                    s
   t   S rI   )r   r  r   r   rF   rG   r    s    zFallbackKernel.apply_constraint)NN)r   r   r   r   r-  r5  r  r   r:  r  r   r   r  r   rF   rF   r   rG   r$    s   	  A

2r$  c                   @   s   e Zd ZU ejed< dS )rv  r   N)r   r   r   r7   r   r   rF   rF   rF   rG   rv    s   
rv  c                       sF   e Zd Zdd Zdd Zeeedf  d fddZd	d
 Z	  Z
S )r?  c                 C   s   t |dkrz|d \}}|tkr@| | d| d|dd  S |tkrptjj|t|}| ||dd  S t	dn|S d S )Nr   r  r  r   znon supported index type)
rY   r5   codegen_list_tuple_accessr6   r1   r   r7  Zcodegen_tuple_accessrP   r@   )r   basenameru  ZityperT   Ztuple_accessrF   rF   rG   rD    s      
z%MultiOutput.codegen_list_tuple_accessc                 C   sb   t jjj}||   d| | jd  | j 7 }|t jjj7 }t jj	| | 
t jj d S N = r   )r1   r   r7  Zdeclarer  rD  r  ru  r#  r  r  )r   r  linerF   rF   rG   r    s
    
*zMultiOutput.codegen.)ru  c                    s,   t  d ||gd tj| | _|| _d S rB  )r   r   r1   r   r  rN   ru  )r   rd  r  ru  r   rF   rG   r     s    zMultiOutput.__init__c                 C   s   dS r   rF   r   rF   rF   rG   r    s    zMultiOutput.should_allocate)r   r   r   rD  r  r
   r   r   r   r  r   rF   rF   r   rG   r?    s   r?  r:   )	r   r  biaspaddingr~   dilationgroups
transposedoutput_paddingc
                 C   s~  dd }
dd }|   |   |dk	r0|   tjj t|dd}t|dd}t| d }d	t|  k r||ksn td	t|  k r|ksn td	t|  k r|ksn tt||}t||}t||}|	dkrtd	g|}	n,d	t|	  k r|ksn tt|	|}	t	|t
s.t|r\|||}| }|
||||	|||}n@|dk	rrt|ddn|}tjj||||||||	|	}| }d	gtttd
t|d
  }t|g| }t|}W 5 Q R X | ||}| jdkr| jdkst||g}t| | t|t|}||||g}|rP|d
|	 |dk	rf|| n|d	| ||||fS )au  
    This function is a helper function to prepare inputs, layout and constant args
    for convolution post-op fusion's create function, including deciding the output
    layout (channels first or channels last), realizing inputs and make them etc. The
    function only supports the CPU device since conv post-op fusion kernel is only
    supported on CPU right now.
    c                 S   s   t | t |kstdt | }|dks0tdd}d}	g }
|
| |  |
||	 |  td|D ]b}|| d ||d   d }| | d ||d   ||d  d  | ||d   }|
| qfttt|
S )NzExpect input dim == weight dimrj   zExpect input dim > 2r   r   )rY   r@   rp  rZ   r5   r   rC  )output_sizeweight_sizerJ  rN  r~   rK  rL  r  Z	BATCH_DIMZWEIGHT_INPUT_CHANNELS_DIMr  r  r  Zinput_size_drF   rF   rG   _conv_input_size  s(    
z<_prepare_convolution_fusion_create.<locals>._conv_input_sizec                 S   s   |   }t|}|dks td|dkrpg }||d |  ||d |  td|D ]}|||  qZn| dd  }|S )Nrj   zExpect weight dim > 2r   r   )r}   rY   r@   rp  rZ   Z	transpose)Zprepacked_weightrL  Zprepacked_weight_sizer  rP  r  rF   rF   rG   _original_deconv_weight_size.  s    zH_prepare_convolution_fusion_create.<locals>._original_deconv_weight_sizeNTr  rj   r   r   r   )r   r1   r   rB  r   rY   r}   r@   r*   r4   rC  r7   r0   r+  Zconvolutionr5   r*  rZ   r   r  r   rA   r  r   r&   insertrp  )r   r   r  rI  rJ  r~   rK  rL  rM  rN  rQ  rR  x_fakeweight_faker  rP  r  rO  	bias_faker>  req_stride_orderr  r  kernel_layoutr  rF   rF   rG   "_prepare_convolution_fusion_create  s    


"

 $
rY  )r   r  rI  c              	   C   s0  |   |   |dk	r |   tjj~ t|dd}t|dd}|dk	rVt|ddn|}|dk	rxtjjj	|||}ntjjj
	||}| }ddg}	t|}
W 5 Q R X | ||	}| jdkr| jdkst||g}t| | t|t|
}g }|dk	r|| n|d| ||||	fS )z
    This function is a helper function to prepare inputs, layout and constant args
    for linear post-op fusion's create function. The function only supports the CPU device
    since linear post-op fusion kernel is only supported on CPU right now.
    NTr  r   r   r   )r   r1   r   rB  r   r7   r0   r+  Zaddmmr  mmr}   r   r  r   rA   r@   r  r   r&   rp  rS  )r   r   r  rI  rT  rU  rV  r>  rO  rW  r  r  rX  r  rF   rF   rG   _prepare_linear_fusion_create  sH    


 
r[  c                	       sP   e Zd Zd fdd	Zdd Zedddee ee ee edd	d
Z  Z	S )ConvolutionUnaryrF   'torch.ops.mkldnn._convolution_pointwisec                    s(   t  j|||d ddd d| _d| _d S )Nr]  mkldnn::_convolution_pointwiser  r  Zconvolution_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r   r   r3  r4  r   rd  r  r  r  r   rF   rG   r     s    zConvolutionUnary.__init__c                 C   s:   | |  | j|  | j| j t| jtr6| 	| d S rI   )
r<  r  r  r  r4  r3  r4   rd  r-  r  r  rF   rF   rG   r    s    zConvolutionUnary.codegenr:   )r   r  rI  padding_stride_	dilation_rL  c              	   C   sR   t | |||||||\}}}}t }t }||t||	t||
g }t|||dS Nrd  r  r  )rY  r   r   r   r\  )r   r   r  rI  rb  rc  rd  rL  attrscalars	algorithmr  r  rX  rv   r   r   rF   rF   rG   r     s,           zConvolutionUnary.create)rF   r]  
r   r   r   r   r  r   r
   rC  r   r   rF   rF   r   rG   r\    s     r\  c                       sp   e Zd Zd
 fdd	Zdd Zeddddee ee ee eee	e
 e	e e	ee  e	e ddd	Z  ZS )ConvolutionBinaryrF   c                    s4   t  j|||d ddd d| _d| _d| _|| _d S )Nz.torch.ops.mkldnn._convolution_pointwise.binaryr^  r_  binaryZconvolution_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm))r   r   r2  r3  r4  cpp_constant_args)r   rd  r  r  rm  r   rF   rG   r     s    zConvolutionBinary.__init__c                 C   s>   | |  | j|  | j| j| j t| jt	r:| 
| d S rI   )r<  r  r  r  r4  r3  r2  r4   rd  r-  r  r  rF   rF   rG   r  /  s    zConvolutionBinary.codegenr:   r   rX  r  rI  rb  rc  rd  rL  binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmc              	   C   s   t | |||||||\}}}}| ||}|d| t }t }t }||	t||
t||t||t||g }t|||dS )Nr   rf  )rY  r  rS  r   r   r   r   rk  )r   r   rX  r  rI  rb  rc  rd  rL  ro  rp  rq  rr  rs  r  r  rX  rW  r   r   r   rF   rF   rG   r   ;  s@           zConvolutionBinary.create)rF   rF   )r   r   r   r   r  r   r
   rC  rP   r   r   r   r   r   rF   rF   r   rG   rk    s&     "
rk  c                       sx   e Zd Zd fdd	Zdd Zdd Zeddddee ee ee ee	e
e e
e	 e
ee  e
e	 d	d
dZ  ZS )ConvolutionBinaryInplacerF   c                    sJ   |d |d g|dd   }t  j|||d ddd d| _d| _d	| _d S )
Nr   r   rj   z/torch.ops.mkldnn._convolution_pointwise_.binaryzmkldnn::_convolution_pointwise_r_  rl  Zconvolution_pointwise_binary_a  
            at::Tensor&(
                at::Tensor& other_t,
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm)r   r   r2  r3  r4  )r   rX  r  r  Zreordered_inputsr   rF   rG   r   h  s    z!ConvolutionBinaryInplace.__init__c                 C   s(   | |  | j|  | j| j| j d S rI   r<  r  r  r  r4  r3  r2  r  rF   rF   rG   r    s    z ConvolutionBinaryInplace.codegenc                 C   s   t | jtst| jj fS rI   r  r   rF   rF   rG   r{    s    z+ConvolutionBinaryInplace.get_mutation_namesr:   rn  c              	   C   s   t | |||||||\}}}}| ||}|d| t }t }t }||	t||
t||t||t||g }tt|d ||dS )Nr   )rX  r  r  )	rY  r  rS  r   r   r   r   rt  ri  )r   r   rX  r  rI  rb  rc  rd  rL  ro  rp  rq  rr  rs  r  r  rv   rW  r   r   r   rF   rF   rG   r     s@           
zConvolutionBinaryInplace.create)rF   )r   r   r   r   r  r{  r   r
   rC  rP   r   r   r   r   r   rF   rF   r   rG   rt  g  s&    $

rt  c                       s2   e Zd Zd fdd	Zdd Zedd Z  ZS )	MKLPackedLinearrF   c                    s(   t  j|||d ddd d| _d| _d S )Nztorch.ops.mkl._mkl_linearzmkl::_mkl_linearr_  Z
mkl_lineara  
            at::Tensor(
                const at::Tensor& self,
                const at::Tensor& mkl_weight_t,
                const at::Tensor& origin_weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                const int64_t prepack_batch_size)r`  r   rd  r  r  r   rF   rG   r     s    zMKLPackedLinear.__init__c                 C   s$   | |  | j|  | j| j d S rI   r<  r  r  r  r4  r3  r  rF   rF   rG   r    s    zMKLPackedLinear.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}}t||g }t|}	|||g}
d |g}tt| | ||	|
|dS re  )	r  r  r   r5   r   rw  r  r   r   )r   r   Zpacked_wZorig_wZ
batch_sizemrv   ocrO  r  r  r  rF   rF   rG   r     s$    
   zMKLPackedLinear.create)rF   )r   r   r   r   r  r   r   r   rF   rF   r   rG   rw    s
    	rw  c                       s:   e Zd Zd
 fdd	Zdd Zedd Zdd	 Z  ZS )LinearUnaryrF   c                    s(   t  j|||d ddd d| _d| _d S )Nz"torch.ops.mkldnn._linear_pointwisemkldnn::_linear_pointwiser_  Zlinear_pointwiseaL  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r`  rx  r   rF   rG   r     s    zLinearUnary.__init__c                 C   s$   | |  | j|  | j| j d S rI   ry  r  rF   rF   rG   r    s    zLinearUnary.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}	}||g}
||rL|ndg|g}|d k	rz|  | |}|
| n|dd  tt| | t	||	g d|
|dS )Nr  r   rb  rf  )
r  r  r   rp  rS  r|  r   r   r   r5   )r   r   wr/  rg  rh  ri  rz  icr{  r  r  rF   rF   rG   r     s&    zLinearUnary.createc                 C   s   d S rI   rF   r   rF   rF   rG   r  7  s    zLinearUnary.apply_constraint)rF   )	r   r   r   r   r  r   r   r  r   rF   rF   r   rG   r|    s    	
r|  c                       s>   e Zd ZdZd fdd	Zdd Zedd Zd	d
 Z  Z	S )LinearBinary)torch.ops.mkldnn._linear_pointwise.binaryrF   c                    s.   t  j|||d ddd d| _d| _d| _d S )Nr  r}  r_  rl  Zlinear_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr)
        ru  rx  r   rF   rG   r   >  s    zLinearBinary.__init__c                 C   s(   | |  | j|  | j| j| j d S rI   rv  r  rF   rF   rG   r  W  s    zLinearBinary.codegenc                 C   s   |  | |}|  | |}|  | |}| ^ }}| \}}|||g}	|g}
|d k	r~|  | |}|	| n|
d| tt| | t	||g d|	|
dS )Nr   rb  rf  )
r  r  r   rp  rS  r  r   r   r   r5   )r   r   yr~  r/  rg  rz  r  r{  r  r  rF   rF   rG   r   a  s(    
zLinearBinary.createc                 C   s   d S rI   rF   r   rF   rF   rG   r  |  s    zLinearBinary.apply_constraint)rF   )
r   r   r   r  r   r  r   r   r  r   rF   rF   r   rG   r  ;  s    

r  c                
       sV   e Zd Zd
 fdd	Zdd Zedddee ee ee ee eddd	Z  Z	S )ConvolutionTransposeUnaryrF   c                    s(   t  j|||d ddd d| _d| _d S )Nz1torch.ops.mkldnn._convolution_transpose_pointwisez(mkldnn::_convolution_transpose_pointwiser_  Zconvolution_transpose_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef output_padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r`  rx  r   rF   rG   r     s    z"ConvolutionTransposeUnary.__init__c                 C   s$   | |  | j|  | j| j d S rI   ry  r  rF   rF   rG   r    s    z!ConvolutionTransposeUnary.codegenr:   )r   r  rI  rb  output_padding_rc  rd  groups_c                 C   sZ   d}t | |||||||||
\}}}}t }t }||	t||
t||g }t|||dS )NTrf  )rY  r   r   r   r  )r   r   r  rI  rb  r  rc  rd  r  rg  rh  ri  rM  r  r  rX  rv   r   r   rF   rF   rG   r     s<    z ConvolutionTransposeUnary.create)rF   rj  rF   rF   r   rG   r    s    	r  c                       sZ   e Zd Zd fdd	Zdd Zedddddddeee eeeeeeedd	d
Z	  Z
S )MkldnnRnnLayerrF   aten.mkldnn_rnn_layerc                    s   t  ||| || _d S rI   )r   r   r  ra  r   rF   rG   r     s    zMkldnnRnnLayer.__init__c              
   C   s0   | |   d| j dd|   d d S )NrG  r  r  r6  )r  r  r  r   r  r  rF   rF   rG   r    s    $zMkldnnRnnLayer.codegenr:   )r   w0w1w2w3hxcxr  batch_sizesr  hidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc              	      s\  |  |   |  | |}|  | |}|  | |}|  | |}|  | |}|  |  | |}|   }t|dkstd|\}}}|||g}| }| }g }||||||g}||	|
||||||g	}tt ||ddd }g  |||g}|||t	|t	|g} fddt
t||D }|S )Nri   zExpect lstm input to be 3D)r  r  c                 S   s   t | dkstdt| S )Nri   zExpect output_shape to be 3D)rY   r@   r   )output_shaper  rF   rF   rG   get_strides_of_lstm_output&  s    z9MkldnnRnnLayer.create.<locals>.get_strides_of_lstm_outputc                    s<   g | ]4\}\}}t t  || t|fg qS rF   )r?  r  r   r   r5   )rS   rT   rO  r  ru  rA  r   rF   rG   rW   1  s   
z)MkldnnRnnLayer.create.<locals>.<listcomp>)r  r  r  r   rY   r@   r  rv  r   r   rq   r_   )r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  Z
seq_lengthZ
mini_batchr  Zhy_shapeZcy_shaperesr  r  r  Zoutput_sizesZoutput_stridesZ	output_irrF   r  rG   r     s\    



zMkldnnRnnLayer.create)rF   r  )r   r   r   r   r  r   r  r
   rC  r   r   rF   rF   r   rG   r    s,      
r  c                       s\   e Zd Zd
 fdd	Zdd Zedeeddddee ee ee eeeddd	Z	  Z
S )QConvPointWisePT2ErF   c                    s"   t |dk| _t ||| dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        rO  NrY   has_biasr   r   rx  r   rF   rG   r   E  s    zQConvPointWisePT2E.__init__c                 C   sh  dd | j D }g }||   |d }|d }| jr@|d n|d }|d |d  }}|dd  \}	}
}}}}}}}}}}d	| _| d
|  d
|  d
|  d
|  d
|  d
|  d
|	  d
|
  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  }||   d| j d| d t| jt	rd| 
| d S )Nc                 S   s   g | ]}|  qS rF   r  r  rF   rF   rG   rW   Z  s     z.QConvPointWisePT2E.codegen.<locals>.<listcomp>r   r   rj   r  iz"torch.ops.onednn.qconv2d_pointwiser  rG  r  r6  r  r  r  r  r  r  r  r4   rd  r-  r  )r   r  rK   
const_argsr   packed_weightrI  w_scalew_zpr~   rJ  rK  rL  x_scalex_zpo_inv_scaleo_zpfp32_outputrq  rr  rs  r  rF   rF   rG   r  X  sv    
	
"zQConvPointWisePT2E.codegenr:   )r   r  r  r  r  r  rI  rc  rb  rd  rL  r  output_zero_pointc                 C   s   d}d }t | ||||	||
|||
\}}}}|d krN|d |d  |d< |d< n|d |d  |d< |d< |  |  |||g }|||||||||g }|rtj|_t|||dS NFrj   r   r   rf  )rY  r   r7   float32r   r  )r   r   r  r  r  r  r  rI  rc  rb  rd  rL  r  r  r  rq  rr  rs  rM  rN  r  r  rX  rv   rF   rF   rG   r     sJ    zQConvPointWisePT2E.create)rF   )r   r   r   r   r  r   r   rC  r
   r   r   rF   rF   r   rG   r  D  s$    1r  c                       sV   e Zd Zd
 fdd	Zdd Zeddddee ee ee eddd
dd	Z  Z	S )QConvPointWiseBinaryPT2ErF   c                    s"   t |dk| _t ||| dS )a~  
        Needs input/weight/output qparams
        if bias is not None
            - inputs = [x, w, b, accum, w_scale, w_zp]
            - const_args = [stride, padding, dilation, groups, x_scale, x_zp, accum_scale, accum_zp, o_inv_scale, o_zp,
            fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, accum, w_scale, w_zp]
            - const_args = const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, accum_scale,
            accum_zp, o_inv_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
           Nr  rx  r   rF   rG   r     s    z!QConvPointWiseBinaryPT2E.__init__c                 C   s  dd | j D }g }||   |d }|d }| jr@|d n|d }|d |d |d   }}}	|d	d  \}
}}}}}}}}}}}}}}}d
| _| d|  d|  d|  d|  d|  d|  d|  d|	  d|  d|
  d|  d|  d|  d|  d|  d|  d|  d|  d|  d|  d|  }||   d| j d| d t| jt	r| 
| d S )Nc                 S   s   g | ]}|  qS rF   r  r  rF   rF   rG   rW     s     z4QConvPointWiseBinaryPT2E.codegen.<locals>.<listcomp>r   r   rj   r  r  iz)torch.ops.onednn.qconv2d_pointwise.binaryr  rG  r  r6  r  )r   r  rK   r  r   r  rI  accumr  r  r~   rJ  rK  rL  r  r  accum_scaleaccum_zpr  r  r  ro  alpharq  rr  rs  Z	conv_argsrF   rF   rG   r    s    
	
"z QConvPointWiseBinaryPT2E.codegenr:   )
r   r  r  rI  rc  rb  rd  rL  r  r  c                 C   s   d}d }t | |||
||||||
\}}}}| ||}|| |
d krd|d |d  |d< |d< n|d |d  |d< |d< |  |	  |||	g }|||||||||||||g }|rtj|_t|||dS r  )rY  r  rp  r   r7   r  r   r  )r   r   r  r  r  r  r  r  r  r  rI  rc  rb  rd  rL  r  r  r  ro  r  rq  rr  rs  rM  rN  r  r  rX  rW  rF   rF   rG   r     s`    
zQConvPointWiseBinaryPT2E.create)rF   rj  rF   rF   r   rG   r    s    9r  c                       sH   e Zd Zd
 fdd	Zdd Zedeeddddeed	dd	Z  Z	S )QLinearPointwisePT2ErF   c                    s"   t |dk| _t ||| dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        rO  Nr  rx  r   rF   rG   r   o  s    zQLinearPointwisePT2E.__init__c                 C   s8  dd | j D }g }||   |d }|d }| jr@|d n|d }|d |d  }}|dd  \}	}
}}}}}}d	| _| d
|	  d
|
  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  }||   d| j d| d t| jt	r4| 
| d S )Nc                 S   s   g | ]}|  qS rF   r  r  rF   rF   rG   rW     s     z0QLinearPointwisePT2E.codegen.<locals>.<listcomp>r   r   rj   r  r  iz"torch.ops.onednn.qlinear_pointwiser  rG  r  r6  r  )r   r  rK   r  r   r  rI  r  r  r  r  r  r  r  rq  rr  rs  r  rF   rF   rG   r    s^    

	
"zQLinearPointwisePT2E.codegenr:   )	r   r  r  r  r  r  rI  r  r  c              	   C   sd   t | |||\}}}}|  |  |||g }|||||	|
|||g }|
rVtj|_t|||dS re  )r[  r   r7   r  r   r  )r   r   r  r  r  r  r  rI  r  r  r  rq  rr  rs  r  r  rX  rv   rF   rF   rG   r     s4    zQLinearPointwisePT2E.create)rF   )
r   r   r   r   r  r   r   rC  r   r   rF   rF   r   rG   r  n  s    )r  c                   @   sB   e Zd ZU dZeed< dd Zdd Zedd Z	d	d
 Z
e
ZdS )rm  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    re  c                 C   s8   t | j|}t|r|S tt| jj d| dd S )Nr)  z not callable)rJ   re  callableAttributeErrorrA   r   )r   rN   rO   rF   rF   rG   __getattr__  s    zMutableBox.__getattr__c                 C   s
   | j  S rI   r  r   rF   rF   rG   r     s    zMutableBox.realizec                 C   s   | j jS rI   r  r   rF   rF   rG   rd    s    zMutableBox.layoutc                 C   sn   t | jtr8t| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nr  z))r6  
)r4   re  rm  rA   r   r   rP   r   )r   Zline0Zendlr  r   rF   rF   rG   r     s    

zMutableBox.__str__N)r   r   r   r:  r   r   r  r   r[  rd  r   r   rF   rF   rF   rG   rm    s   

rm  c                   @   s   e Zd Zedd ZdS )r:   c                 C   s   t t| S rI   )r:   r  )re  rF   rF   rG   r     s    zTensorBox.createN)r   r   r   r   r   rF   rF   rF   rG   r:     s   c                   @   sL   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zedd Z	edd Z
dS )r  c                 C   s&   t | jttfr"| j tjjkS dS r   )r4   re  r|  r  r  r1   r   Zgraph_inputsr   rF   rF   rG   is_input_buffer  s    zStorageBox.is_input_bufferc                 C   s   t | jtttttfr | j S t | jtt	fs>t
t| j| j }| j }td t| j | j | j d| jd| _tj| j| j_| j| j_|| j_|| j_| jjS )Nrb  rc  )r4   re  rk  r  r|  r  r  r  r  r9  r@   rA   r   r   r   r   r   r   r1   r   r  rN   r   r   r   )r   r   r   rF   rF   rG   r     s6    



	
zStorageBox.realizec                 C   s0   t | jttfr,|  dkr,|  r,|   dS )zL
        Called on buffers we expect to be forced to realize later.
        r   N)r4   re  r  r9  r  8is_pointwise_non_scalar_tensor_num_reads_larger_than_oner   r   rF   rF   rG   r  0  s    
zStorageBox.realize_hintc                 C   s(   t | jto&|  tjkp&|  tjkS rI   )r4   re  r  r  r   Zrealize_acc_reads_thresholdr   realize_bytes_thresholdr   rF   rF   rG   r  ;  s    z!StorageBox.has_exceeded_max_readsc                 C   sn   t ttf ddd}|dkrjt| jttfrj|  tjksbt| 	 tj
ksbt| jrj|| jrj|   dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        )loopsc                    s$   dg}|    t fdd|D S )zW
            The heuristic for realizing reused result of heavy ops on cpu
            expc                 3   s   | ]}|d   kV  qdS )r  NrF   )rS   opZfn_strrF   rG   rj  M  s     zGStorageBox.mark_reuse.<locals>.should_realize_on_cpu.<locals>.<genexpr>)r   r  )r  Z	heavy_opsrF   r  rG   should_realize_on_cpuG  s    z4StorageBox.mark_reuse.<locals>.should_realize_on_cpur   N)r   r  r9  r4   re  r  r   Zrealize_reads_thresholdrY   r   r  r   r   )r   r  r  rF   rF   rG   r  A  s    		zStorageBox.mark_reusec                 C   sz   | j }t|tttfrdS t|tr.| }nBt|ttfsHt	t
|td t| | | d|d }t|jS )Nr   rb  rc  )re  r4   r  r|  r  rk  rl  r  r9  r@   rA   r   r   r   r   rY   r   )r   re  rt  rF   rF   rG   r  Z  s     


	zStorageBox.num_readsc                 C   s,   t | jtr(tdd | j D dkS dS )Nc                 s   s   | ]}|j d kV  qdS )r   Nr[   )rS   readrF   rF   rG   rj  r  s     zVStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_one.<locals>.<genexpr>r   T)r4   re  r  r#  r   r   rF   rF   rG   r  n  s    
zCStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_oneN)r   r   r   r  r   r  r  r  r%   r  r  rF   rF   rF   rG   r    s   
r  c                       sV   e Zd Zeeddd Z fddZej	j
ed fddZ fd	d
Z  ZS )InterpreterShimNc                   C   s   t jtS rI   )r7   r   Zsymbolic_tracer   rF   rF   rF   rG   	_dummy_gmy  s    zInterpreterShim._dummy_gmc                    s>   t  j|  dd | | _|| _|| _d| _|j| _d | _	d S )NF)Zgarbage_collect_values)
r   r   r  moduler   
submodulesZextra_tracebackr`  Z
fetch_attrcurrent_noder   r   r  r   rF   rG   r   ~  s    zInterpreterShim.__init__)r   r  c                    s   || _ t |S rI   )r  r   run_node)r   r   r   rF   rG   r    s    zInterpreterShim.run_nodec              
      s0   t |  t j||W  5 Q R  S Q R X d S rI   )r1   Zset_interpreter_handlerr   run)r   rK   rL   r   rF   rG   r    s    zInterpreterShim.run)r   r   r   r   r  	lru_cacher  r   r7   r   r   r   r  r  r   rF   rF   r   rG   r  x  s   r  c                       sx   e Zd ZdZ fddZedd Zedd Zdd	 Ze	j
d
ddZdd Zdd Zdd Zdd Zdd Z  ZS )r  z
    Captures the body of a Loops subclass into an FX graph.  Persists any
    indexing simplifications and makes it easier to analyze loop bodies.
    c                    sj   t    || _i | _i | _g | _g | _i | _i | _g | _	d| j
i| _i | _g | _t| ||| _d | _d S )N	get_index)r   r   r  r  indexing_exprs_namer   writesr  r  rX  r  r  	subblocksindirect_varsLoopBodyBlock
root_blockindexing)r   rO   rK   r  r   rF   rG   r     s    
zLoopBody.__init__c                 C   s0   t | jjfdd | j D }dd |D S )Nc                 s   s   | ]}|j V  qd S rI   )r   )rS   blockrF   rF   rG   rj    s     z%LoopBody.get_nodes.<locals>.<genexpr>c                 S   s   g | ]}|j D ]}|qqS rF   )rB   )rS   r   rC   rF   rF   rG   rW     s       z&LoopBody.get_nodes.<locals>.<listcomp>)r  chainr  r   r  r  )r   Z
all_graphsrF   rF   rG   	get_nodes  s
    zLoopBody.get_nodesc                 C   s   ddl m} || S )Nr   )	BoundVars)boundsr  )r   r  rF   rF   rG   r    s    zLoopBody.boundsc                 C   s`   dt | j g}|dd | j D  |dd td| jfg| j D  d	|S )Nzvar_ranges = c                 S   s   g | ]\}}| d | qS )rG  rF   )rS   rN   r  rF   rF   rG   rW     s     z&LoopBody.debug_str.<locals>.<listcomp>c                 S   s   g | ]\}}| |qS rF   )	debug_str)rS   rN   r  rF   rF   rG   rW     s   r  r  )
r^   r  r  r  r  r  r  r  r  r   r   rF   rF   rG   r    s    
 zLoopBody.debug_str)r  c                 C   sd   t | || |d k	r,|t | | d|< || jkrZdt| j }|| j|< || j|< | j| S )NZ
_name2exprrU   )rJ   rp  r  rY   r  )r   r  categorybuf_namerN   rF   rF   rG   add_index_expr  s    


zLoopBody.add_index_exprc                 C   s<   |d   r|| jkr|}n| t| j }|| j|< |S )zaNot actually for nn.Modules, but subblocks in generated code are mapped to FX call_module opcodesr  )	isnumericr  rY   )r   r  r3   rN   rF   rF   rG   add_submodule  s
    
zLoopBody.add_submodulec                 C   s(   dt | j }t|}| j| |S )NZindirect)rY   r  r.   rp  )r   r}   rN   r+  rF   rF   rG   add_indirect  s    zLoopBody.add_indirectc                    sB   t t  krdS | jdk	s"t fdd| j D | _dS )z,Swap in a variable used in indirect indexingNc                    s    i | ]\}}|t | iqS rF   r&  r  newr   rF   rG   rn     s      z-LoopBody.replace_indirect.<locals>.<dictcomp>)rP   r  r@   r  )r   r   r  rF   r  rG   replace_indirect  s    zLoopBody.replace_indirectc                 C   s   | j d k	st| j | S rI   )r  r@   r   rF   rF   rG   r    s    zLoopBody.get_indexc                    s   t tj| }t|tjks.t|jftfdd|D sHtttj	 |  fddj
 D _ }d _|S )Nc                 3   s   | ]}| j kV  qd S rI   )r  r  r   rF   rG   rj    s     z$LoopBody.__call__.<locals>.<genexpr>c                    s   i | ]\}}|t | qS rF   r&  )rS   rN   r  r'  rF   rG   rn     s    z%LoopBody.__call__.<locals>.<dictcomp>)r5   r  r  rY   r  r@   ro  r^   r_   r|  r  r  r  r  )r   ru  rU   r\  rF   )r(  r   rG   __call__  s     
zLoopBody.__call__)r   r   r   r:  r   r%   r  r  r  r;   r   r  r  r  r  r  r  r   rF   rF   r   rG   r    s   


	r  c                   @   s@   e Zd ZdZeedef ee dddZdd Z	dd	d
Z
dS )r  a  
    Captures the body of a Loops subclass into an FX graph.
    In normal cases there will be a 1:1 mapping between LoopBody and
    LoopBodyBlock, hower in the case of ops.masked() the masked out
    operations will manifest as an extra LoopBodyBlock.
    .)r  rO   rK   c           	   	      s   |_ dfdd	 G  fdddtj}tj tjjjd_	dddi }d	d
l
m} d	dlm} |||j j}tjr||}t| t||  W 5 Q R X j_d S )Nc              	      s    dd j| ||fi S )Ncall_moduler  )create_proxyr  r  )r  r  r  r   tracerrF   rG   	add_index  s    z)LoopBodyBlock.__init__.<locals>.add_indexc                       s   e Zd Zd_eejd fddZd fdd	Z fdd	Z	d
d Z
 fddZeejejed fddZeedef dfddZedfdd	ZefddZdS )z/LoopBodyBlock.__init__.<locals>.CaptureIndexingCaptureIndexing)rN   rU   c                    s    |d|}| j ||S )Nr   )_innerr2  )r   rN   rU   r  rF   rG   r2    s    z4LoopBodyBlock.__init__.<locals>.CaptureIndexing.loadNc                    s    |d|}| j ||||S Nr  )r  r  )r   rN   rU   r   r  r  rF   rG   r    s    z5LoopBodyBlock.__init__.<locals>.CaptureIndexing.storec                    s    |d|}| j |||S r  )r  r?  )r   rN   rU   r   r  rF   rG   r?    s    z?LoopBodyBlock.__init__.<locals>.CaptureIndexing.store_reductionc                    s8   | j |||| d|kr4t fddtdD S  S )Nr  c                 3   s   | ]} | V  qd S rI   rF   rR   r\  rF   rG   rj    s     zLLoopBodyBlock.__init__.<locals>.CaptureIndexing.reduction.<locals>.<genexpr>ri   )r  r>  r6   rZ   )r   r   r;  r3  r   rF   r  rG   r>    s    z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.reductionc                    s:   t |ttjfr"| jt||S  |d}| j||S NrX  )r4   rC  r;   r   r  r   r  )r   rU   r   r  rF   rG   r    s    
z:LoopBodyBlock.__init__.<locals>.CaptureIndexing.index_expr)offsets_nameoffsets_sizeindexing_dtyperN  c                    s    |d}| j |||||S r  )r  	bucketize)r   r  r  r  r  rN  r  rF   rG   r  $  s    
    z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.bucketize.)masked_bodyc                    sH    fdd}j |d}tj |g   j j|< d|| |fi S )zb
                Recursively capture the masked out body in another LoopBodyBlock
                c                    s   t j|  |S rI   )r1   r0   r  )r0  rX  ZsubblockrF   rG   shim7  s    zDLoopBodyBlock.__init__.<locals>.CaptureIndexing.masked.<locals>.shimZmasked_subblockr  )r  r  r  r  r  )Z
mask_proxyr  Zother_proxyr  rN   r  r  rG   r  1  s       z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.maskedTc                    sD    fdd}j dj |d | fi  S )z
                Flow data from tensors into indexing formulas.
                Introduce a call_module to update the indexing.
                c                    s   j tj|   d S rI   )r  r  r1   r0   indirect_indexing)Znew_var)checkr   r}   r+  rF   rG   set_indirectH  s     zWLoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexing.<locals>.set_indirectr  set_)r  r  r  r  )Zindex_proxyr}   r  r  r  )r  r}   r+  rG   r  A  s    zALoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexingc                    s     dd| fi  d S )Nr>  )r  r  )r  rF   rG   r>  V  s    z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.output)N)T)r   r   r   rN   rP   r;   r   r2  r  r?  r>  r  r7   r   r  r  r   r   r   r  r  r>  rF   r  r   r  rF   rG   r  	  s"   	r  )Z
tracer_clsplaceholderr0   rF   r   )IndexPropagation)SimplifyIndexing)N)r  r1   ZWrapperHandlerr7   r   ZTracerZGraphr   r   r  Zindex_propagationr  r   r  r  r   Zconstant_and_index_propagationZset_ops_handlerr0   r>  )	r   r  rO   rK   r  Z	proxy_opsr  r  handlerrF   r  rG   r     s    Q
zLoopBodyBlock.__init__c                 C   s"   | j }| jj}t||t S rI   )r   r  r  r  r  r1   Zget_ops_handlerr  rF   rF   rG   r  k  s    zLoopBodyBlock.__call__r  c              
   C   s8   t j| jj| jj}tdd|	 
dd| dS )Nz;[^\n]*r   zdef forward(zdef r  )r7   r   ZGraphModuler  r  r   coderesubstripr.  )r   rN   r  rF   rF   rG   r  q  s    zLoopBodyBlock.debug_strN)r  )r   r   r   r:  r  r   r   r
   r   r  r  rF   rF   rF   rG   r    s   mr  c                       sL   e Zd ZdZd fdd	Zdd Zdd Zed	d
ddZdd Z	  Z
S )Waitz
    Wait should not be used by itself.  It should always be constructed in tandem
    with a collective op that produces a work to wait on.
    rF   c                    s   t  ||| d S rI   r   r   rx  r   rF   rG   r     s    zWait.__init__c                 C   s   dS r   rF   r   rF   rF   rG   r    s    zWait.should_allocatec                 C   sN   | d dd | jD \}|| d| d ||   d|  d S )NzGfrom torch.distributed._functional_collectives_impl import _wait_tensorc                 s   s   | ]}|  V  qd S rI   r  r  rF   rF   rG   rj    s     zWait.codegen.<locals>.<genexpr>z = _wait_tensor(r6  rG  )add_import_oncer  r  r  )r   r  Zinput_collectiverF   rF   rG   r    s    zWait.codegenr:   )collective_opc                 C   s   |   t| |gdS )N)rd  r  )rr  r  r   )r   r  rF   rF   rG   r     s
    zWait.createc                 C   s   | j d  gS r   )r  r9  r   rF   rF   rG   rz    s    zWait.get_alias_names)rF   )r   r   r   r:  r   r  r  r   r   rz  r   rF   rF   r   rG   r  {  s   	 r  c                       sD   e Zd ZdZ fddZdd Zdd Zedd	 Zd
d Z	  Z
S )CollectiveKernela  
    Each collective should follow the pattern:
    - extend InPlaceCollectiveKernel or OutOfPlaceCollectiveKernel.
    - the kernel delegates into c10d processgroup, which returns a 'work' obj
    - the work obj is registered via _register_tensor_work so it can be waited on later
    c                    s$   t  d ||| tj| | _d S rI   r   r   r1   r   r  rN   rx  r   rF   rG   r     s    zCollectiveKernel.__init__c                 C   s   t dd S NzMust implementr  r   r  r  input_namesrF   rF   rG   codegen_collective  s    z#CollectiveKernel.codegen_collectivec                 C   s   t dd S r  r  r  rF   rF   rG   codegen_output  s    zCollectiveKernel.codegen_outputc                 C   s   dd }t t||S )Nc                 S   s(   t t|  |  |  | }t|S rI   )InPlaceHintr   r   r   r   r:   r   )r+  r  rF   rF   rG   
wrap_input  s
     z;CollectiveKernel.wrap_inputs_as_inplace.<locals>.wrap_input)r5   r   )r   r  r  rF   rF   rG   wrap_inputs_as_inplace  s    z'CollectiveKernel.wrap_inputs_as_inplacec              
   C   s   | d | d | d dd | jD }|  }| j\}}}|| d| d| d| d	 | ||| | ||| |d
| d| d d S )Nz import torch.distributed as distz1import torch.distributed.distributed_c10d as c10dzEimport torch.distributed._functional_collectives_impl as fun_col_implc                 S   s   g | ]}|  qS rF   r  r  rF   rF   rG   rW     s     z,CollectiveKernel.codegen.<locals>.<listcomp>z0_pg = c10d._find_or_create_pg_by_ranks_and_tag('rJ  r  r6  z#fun_col_impl._register_tensor_work(z_work))r  r  r  r  r  r	  r  )r   r  r  r  tagranks
group_sizerF   rF   rG   r    s     

zCollectiveKernel.codegen)r   r   r   r:  r   r  r	  r   r  r  r   rF   rF   r   rG   r    s   
	r  c                       s8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
InPlaceCollectiveKernelz
    InPlaceCollectiveKernel are those with in-out arguments such as all_reduce.
    Extend this kernel if your collective needs to modify its inputs in-place.
    c                    s   t  ||| d S rI   r   rx  r   rF   rG   r     s    z InPlaceCollectiveKernel.__init__c                 C   s   dS r   rF   r   rF   rF   rG   r    s    z'InPlaceCollectiveKernel.should_allocatec                 C   s   dS r  rF   r   rF   rF   rG   has_side_effects  s    z(InPlaceCollectiveKernel.has_side_effectsc                 C   sF   t |dkr*|| dd| d n|| d|d   d S )Nr    = [,z] rG  r   )rY   r  r   r  rF   rF   rG   r	    s    z&InPlaceCollectiveKernel.codegen_output)	r   r   r   r:  r   r  r  r	  r   rF   rF   r   rG   r    s
   r  c                       sR   e Zd ZdZ fddZdd Zdd Zdd	 ZedddZ	edd Z
  ZS )OutOfPlaceCollectiveKernelz
    OutOfPlaceCollectiveKernel are those that allocate their
    outputs and leave their inputs inplace, such as all_gather.
    c                    s$   t  ||| | || _|| _d S rI   )r   r   outputsoriginal_inputsr   rd  r  r  r  r   rF   rG   r     s    z#OutOfPlaceCollectiveKernel.__init__c                 C   s   dS r   rF   r   rF   rF   rG   r    s    z*OutOfPlaceCollectiveKernel.should_allocatec                 C   s   dS r  rF   r   rF   rF   rG   r     s    z+OutOfPlaceCollectiveKernel.has_side_effectsc                 C   sX   dd | j D }|| dd| d || dddd | jD  d d S )	Nc                 S   s   g | ]}|  qS rF   r  r  rF   rF   rG   rW     s     z=OutOfPlaceCollectiveKernel.codegen_output.<locals>.<listcomp>z_inputs = [r  r  r  c                 s   s   | ]}|j V  qd S rI   rM   r  rF   rF   rG   rj    s     z<OutOfPlaceCollectiveKernel.codegen_output.<locals>.<genexpr>)r  r  r   r  r  rF   rF   rG   r	    s    z)OutOfPlaceCollectiveKernel.codegen_outputNc                 C   sP   g }|D ]B}|  }|d k	r$|| tt| | |dd}|| q|S )Nrb  r5  )r   OutputBufferr   r   r   rp  )r   r  Zsize_cbr  r  r  ZbuffrF   rF   rG   create_output_buffers  s    z0OutOfPlaceCollectiveKernel.create_output_buffersc                    s    fddt |D S )Nc                    s&   g | ]\}}t |j d | dqS )r  r  )MultiOutputNoSizeAssertrd  )rS   rT   Zout_tcollrF   rG   rW     s   
zBOutOfPlaceCollectiveKernel.create_output_nodes.<locals>.<listcomp>r   )r   r  Zoutput_buffersrF   r  rG   create_output_nodes  s    
z.OutOfPlaceCollectiveKernel.create_output_nodes)N)r   r   r   r:  r   r  r  r	  r   r  r  r   rF   rF   r   rG   r    s   r  c                       s0   e Zd ZdZdd Z fddZdd Z  ZS )r
  a  
    Helper OP to encode an in/out argument that tries to make it inplace whenever possible.
    Wrap the input of your inplace op to enable this behavior.

    The design is based on two key decisions:
    - this node is resposible for allocating the in/out buffer used by the collective.
        This is controlled by the ``should_allocate`` method that returns True here and
        False for the collective node
    - The scheduler special-case this node and enable it to reuse its input.
    c                 C   sB   | j d  }|  }|| | j d s>|| d| d d S )Nr   r"  z) #no reuse)r  r9  r  Z	did_reuser  )r   r  Z
input_namer  rF   rF   rG   r  3  s    zInPlaceHint.codegenc                    s6   |  |}t d || |gd tj| | _d S rB  )r  r   r   r  r1   r   r  rN   )r   rd  r  r   rF   rG   r   9  s    
zInPlaceHint.__init__c                 C   s   dS r  rF   r   rF   rF   rG   r  >  s    zInPlaceHint.should_allocate)r   r   r   r:  r  r   r  r   rF   rF   r   rG   r
  '  s   r
  c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r  zO
    Represent the output buffer used by ops that require multiple of them
    c                    s$   t  jd |g d tj| | _d S )Nr  r  )r   rd  r   rF   rG   r   G  s    zOutputBuffer.__init__c                 C   s   dS r  rF   r   rF   rF   rG   r  K  s    zOutputBuffer.should_allocatec                 C   s   | d| j  d S )Nz# collective out buffer )r  rN   r  rF   rF   rG   r  N  s    zOutputBuffer.codegen)r   r   r   r:  r   r  r  r   rF   rF   r   rG   r  B  s   r  c                       s(   e Zd ZdZ fddZdd Z  ZS )r  z
    Extract partial output from a multi-output OP.
    Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emiting this.
    c                    s   t  ||g  || _d S rI   )r   r   rU   )r   rd  r  rU   r   rF   rG   r   X  s    z MultiOutputNoSizeAssert.__init__c                 C   s,   | |   d| jd   | j  d S rF  )r  r  r  rU   r  rF   rF   rG   r  \  s     zMultiOutputNoSizeAssert.codegen)r   r   r   r:  r   r  r   rF   rF   r   rG   r  R  s   r  c                       sN   e Zd Z fddZdd Zeed eeee edddZ	d	d
 Z
  ZS )AllReduceCoalescedc                    s   t  ||| || _d S rI   r   r   	reduce_opr   rd  r  r  r   r   rF   rG   r   c  s    zAllReduceCoalesced.__init__c                 C   s   dS r   rF   r   rF   rF   rG   r  g  s    z"AllReduceCoalesced.should_allocater:   r  r   r  r  r  c           	      C   s4   |  |}t|d  }t|||||g|d}|S Nr   )rd  r  r  r   )r  rv  r   r  )	r   r  r   r  r  r  inplace_inputsrd  rv   rF   rF   rG   r   j  s    	
zAllReduceCoalesced.createc              
   C   s,   | | d| dt| j d| d d S )Nz"_work = dist.all_reduce_coalesced(z%, op=fun_col_impl._str_to_reduce_op('
'), group=_pg, async_op=True)r  rP   r   r  rF   rF   rG   r  ~  s     z%AllReduceCoalesced.codegen_collective)r   r   r   r   r  r   r
   rP   rC  r   r  r   rF   rF   r   rG   r  b  s   r  c                       sB   e Zd Z fddZedeeee edddZdd Z	  Z
S )		AllReducec                    s   t  ||| || _d S rI   r  r!  r   rF   rG   r     s    zAllReduce.__init__r:   r   r   r  r  r  c           	      C   s:   |  |g}t|d  }t|||||g|d}|d S r#  )r  rv  r   r(  )	r   r   r   r  r  r  r$  rd  rv   rF   rF   rG   r     s    zAllReduce.createc              
   C   s,   | | d| d| dt| j d d S )Nz_work = dist.all_reduce(z, async_op=True, group=(_pg, op=fun_col_impl._str_to_reduce_op(''))r'  r  rF   rF   rG   r    s     zAllReduce.codegen_collectiver   r   r   r   r   rP   r
   rC  r   r  r   rF   rF   r   rG   r(    s       r(  c                       s@   e Zd Z fddZedeee edddZdd Z	  Z
S )	AllGatherIntoTensorc                    s   t  |||| d S rI   r   r  r   rF   rG   r     s    zAllGatherIntoTensor.__init__r:   )r   r  r  r  c           
         sZ   |  |g} fdd}| ||}t|d  }t||||| gd}	| |	|d S )Nc                    s   | d   9  < d S r   rF   r  r  rF   rG   compute_size  s    z0AllGatherIntoTensor.create.<locals>.compute_sizer   rd  r  r  r  )r  r  rv  r   r-  r  )
r   r   r  r  r  r  r0  r  rd  rA  rF   r/  rG   r     s    zAllGatherIntoTensor.createc              
   C   s&   | | d| d| d| d d S )Nz$_work = dist.all_gather_into_tensor([0], !_inputs[0], async_op=True, group=z_pg)r  r  rF   rF   rG   r    s    z&AllGatherIntoTensor.codegen_collectiver,  rF   rF   r   rG   r-    s   r-  c                       sB   e Zd Z fddZedeeee edddZdd Z	  Z
S )	ReduceScatterTensorc                    s   t  |||| || _d S rI   r  r   rd  r  r  r  r   r   rF   rG   r     s    zReduceScatterTensor.__init__r:   r)  c                    s\   |  |g} fdd}| ||}t|d  }	t|	|||| g|d}
| |
|d S )Nc                    s   | d     < d S r   rF   r.  r/  rF   rG   r0    s    z0ReduceScatterTensor.create.<locals>.compute_sizer   rd  r  r  r  r   )r  r  rv  r   r5  r  )r   r   r   r  r  r  r  r0  r  rd  rA  rF   r/  rG   r     s    	zReduceScatterTensor.createc                 C   s2   | | d| d| d| dt| j d
 d S )Nz#_work = dist.reduce_scatter_tensor(r2  r3  r*  r+  r'  r  rF   rF   rG   r    s    &z&ReduceScatterTensor.codegen_collectiver,  rF   rF   r   rG   r5    s   r5  c                       sD   e Zd Z fddZeed eee edddZdd Z	  Z
S )	AllGatherIntoTensorCoalescedc                    s   t  |||| d S rI   r   r  r   rF   rG   r     s    z%AllGatherIntoTensorCoalesced.__init__r:   )r  r  r  r  c           	         sT    fdd|D }fdd}  ||}t|d  }t|||||gd}|S )Nc                    s   g | ]}  |qS rF   r  r  r  rF   rG   rW     s     z7AllGatherIntoTensorCoalesced.create.<locals>.<listcomp>c                    s   | d   9  < d S r   rF   r.  r/  rF   rG   r0    s    z9AllGatherIntoTensorCoalesced.create.<locals>.compute_sizer   r1  )r  rv  r   r8  )	r   r  r  r  r  r0  r  rd  rA  rF   r   r  rG   r     s    z#AllGatherIntoTensorCoalesced.createc              
   C   s&   | | d| d| d| d d S )NzO_work = fun_col_impl._all_gather_into_tensor_coalesced_fallback(output_tensors=, input_tensors=z_inputs, group=r&  r4  r  rF   rF   rG   r    s    z/AllGatherIntoTensorCoalesced.codegen_collectiver   r   r   r   r   r
   rP   rC  r   r  r   rF   rF   r   rG   r8    s   r8  c                       sF   e Zd Z fddZeed eeee edddZdd Z	  Z
S )	ReduceScatterTensorCoalescedc                    s   t  |||| || _d S rI   r  r6  r   rF   rG   r     s    z%ReduceScatterTensorCoalesced.__init__r:   r"  c           
         sV    fdd|D }fdd}  ||}t|d  }t|||||g|d}	|S )Nc                    s   g | ]}  |qS rF   r  r  r  rF   rG   rW      s     z7ReduceScatterTensorCoalesced.create.<locals>.<listcomp>c                    s   | d     < d S r   rF   r.  r/  rF   rG   r0  "  s    z9ReduceScatterTensorCoalesced.create.<locals>.compute_sizer   r7  )r  rv  r   r<  )
r   r  r   r  r  r  r0  r  rd  rv   rF   r9  rG   r     s    	z#ReduceScatterTensorCoalesced.createc                 C   s2   | | d| d| dt| j d| d
 d S )NzN_work = fun_col_impl._reduce_scatter_tensor_coalesced_fallback(output_tensors=r:  z,_inputs, op=fun_col_impl._str_to_reduce_op('r%  r&  r'  r  rF   rF   rG   r  3  s    &z/ReduceScatterTensorCoalesced.codegen_collectiver;  rF   rF   r   rG   r<    s   r<  )T)TFN)FN)r  r   r  r  r  loggingr  textwrapr   r   enumr   r   inspectr   typingr   r   r   r	   r
   r   r   r   r   r   Zunittest.mockr   r;   r   r   Ztorch._loggingr7   Ztorch.fxZtorch.utils._pytreeutilsZ_pytreer  Ztorch._dynamo.utilsr   Ztorch._prims_commonr   r   r   r   r   Ztorch.fx.operator_schemasr   Ztorch.utils._sympy.functionsr   r   r   r   r   r   Zcodegen.commonr    Zcuda_propertiesr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   Zvirtualizedr0   r1   	getLoggerr   r  r   r+  rH   rQ   rb   rd   rh   r  rs   rC  ry   r   r   r   r   r   r   r   r  r   r   r   r   r8  r   r   r   r  r  r  r  r7  r8  r9  r  r  r   r  r  r  r  r  r8   r  r
  r  r  r  r;  rA  rC  rE  r-  r  r   rf  ri  r  r|  r  r~  r  rk  r  r  r  r  r  r
  r  r  r  r  r  r!  r9   r$  rv  r?  r  rY  r[  r\  rk  rt  rw  r|  r  r  r  r  r  r  rm  r:   r  r   ZInterpreterr  r  r  r  r  r  r  r
  r  r  r  r(  r-  r5  r8  r<  rF   rF   rF   rG   <module>   s  08
$	

@O
2    5 }
 J:): @1m#[Hh s31g  j$"V$% e*   8H[_5@EVn 	 #o(ld ,55&()