U
    9%e                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZmZ d dlZd dlZd dlmZ ddlmZmZmZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' e(e)Z*dd Z+G dd dZ,dddddZ-ej.j/j0ej.j/j1ej.j/j2ej.j/j3dZ4G dd dZ5G dd de5Z6G dd de5Z7G dd de5Z8G dd de5Z9G dd de9Z:d)d!d"Z;ej<G d#d$ d$Z=G d%d& d&Z>G d'd( d(Z?dS )*    N)DictListOptionalSet)dynamo_timed   )configdependenciesirmetrics)get_scheduling_for_device)StarDepWeakDep)ComputedBuffer)SimplifyIndexing)cache_on_selfcmpfree_symbol_hasget_device_tflopsget_dtype_sizeget_gpu_dram_gbps
has_tritonsympy_product)Vc                 C   sB   t | trt| td} tj| dd}d|kr>dt|d S |S )Nkey   )indent
    )
isinstancesetsortedstrpprintpformattextwrapr   )objresult r)   X/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torch/_inductor/scheduler.pyr%   %   s    
r%   c                   @   s0   e Zd Zdd Zdd Zdd Zdd ZeZd	S )

OutputNodec                 C   s   |h| _ g | _d S N)unmet_dependenciesinverse_usersselfdepr)   r)   r*   __init__0   s    zOutputNode.__init__c                 C   s   dS NFr)   r0   r)   r)   r*   is_reduction4   s    zOutputNode.is_reductionc                 C   s   dS )Nr)   r)   r4   r)   r)   r*   get_alias_names7   s    zOutputNode.get_alias_namesc                 C   s   dS )NZOUTPUTr)   r4   r)   r)   r*   get_name:   s    zOutputNode.get_nameN)__name__
__module____qualname__r2   r5   r6   r7   __repr__r)   r)   r)   r*   r+   /   s
   r+   BaseSchedulerNodenode1node2c                 C   s,   |   s|  rt| |S t| |S d S r,   )
is_foreachForeachKernelSchedulerNodefuseFusedSchedulerNoder=   r)   r)   r*   rB   @   s    rB   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmc                   @   s  e Zd ZdejdddZdd Zeddd	Zedd
dZ	dd Z
eeef dddZdd Zed dddZee eeef dddZdd Zdd Zdd Zejd d!d"Zd#d$ Zee dd%d&Zee dd'd(Zd)d* Zd+d, Zd-d. Zedd/d0Zedd1d2Zee dd3d4Z ed  dd5d6Z!d7d8 Z"d9d: Z#d;d< Z$d=d> Z%d?d@ Z&ej'dAdBdCZ(dDdE Z)dFdG Z*dHdI Z+dRdKdLZ,e-ddMdNZ.e/ddOdPZ0dQS )Sr<   	Scheduler	schedulernodec                 C   sH   || _ || _d | _g | _| |  d | _d | _d | _d | _	d| _
d S r3   )rF   rG   usersr.   set_read_writesZget_read_writesrecursive_predecessors	min_order	max_order
last_usagewritten)r0   rF   rG   r)   r)   r*   r2   Q   s    zBaseSchedulerNode.__init__c                 C   s   t | j d|  dS )Nz(name=)typer8   r7   r4   r)   r)   r*   r;   ]   s    zBaseSchedulerNode.__repr__returnc                 C   s   |   }| dt| j dt| jj d| dt| jj | dt| j | dt| jj| j  | d| j	 g}z|| 
 g7 }W n" tk
r   tjdd	d
 Y nX d| S )z#Longer form printout for trace logsz: (rO   z
.writes = z.unmet_dependencies = z.met_dependencies = z	.users = zIgnoring error in debug_str()T)exc_infor   )r7   rQ   r8   rG   r%   read_writeswritesr-   readsrH   debug_str_extra	Exceptionlogwarningjoinrstripr0   namelinesr)   r)   r*   	debug_str`   s    "
zBaseSchedulerNode.debug_strc                 C   s   dS )N r)   r4   r)   r)   r*   rY   s   s    z!BaseSchedulerNode.debug_str_extrac                 C   s   t d| | j| jj d S )Nz(%s: unmet_dependencies = %s, writes = %s)r[   infor-   rV   rW   r4   r)   r)   r*   log_detailsv   s    zBaseSchedulerNode.log_detailsrenamesc                 C   s   |  | j| d S r,   )rI   rV   renamer0   rg   r)   r)   r*   update_mutated_names~   s    z&BaseSchedulerNode.update_mutated_namesc                 C   s   |  | j| d S r,   )rI   rV   Z	with_readr/   r)   r)   r*   add_mutation_dep   s    z"BaseSchedulerNode.add_mutation_depNodeUserrH   c                 C   s\   i }|D ]@}t |j|kr:||t |j |t |j< q||t |j< qt| | _d S r,   )idrG   mergelistvaluesrH   )r0   rH   r(   user)   r)   r*   	set_users   s     zBaseSchedulerNode.set_usersfuture_used_buffersmutation_real_namec                    s(   |   } fdd|D }|| | _d S )Nc                    s   h | ]}  ||qS r)   )get).0krv   r)   r*   	<setcomp>   s     z3BaseSchedulerNode.set_last_usage.<locals>.<setcomp>)used_or_aliased_buffer_namesrM   )r0   ru   rv   Zused_buffersr)   rz   r*   set_last_usage   s    z BaseSchedulerNode.set_last_usagec                 C   s
   | j  S r,   )rG   r6   r4   r)   r)   r*   get_aliases   s    zBaseSchedulerNode.get_aliasesc                 C   s
   | j  S r,   )rG   get_mutation_namesr4   r)   r)   r*   get_mutations   s    zBaseSchedulerNode.get_mutationsc                 C   s   t |  p|  S r,   )boolr~   r   r4   r)   r)   r*   has_aliasing_or_mutation   s    z*BaseSchedulerNode.has_aliasing_or_mutation)rwc                 C   s   || _ | j j| _|   d S r,   )rV   rX   r-   
prune_deps)r0   r   r)   r)   r*   rI      s    
z!BaseSchedulerNode.set_read_writesc                 C   s   | j jS r,   )rV   	op_countsr4   r)   r)   r*   r      s    zBaseSchedulerNode.op_countsc                 C   s   dd t | jj| jjD S )Nc                 S   s   h | ]
}|j qS r)   r`   rx   r1   r)   r)   r*   r{      s   z6BaseSchedulerNode.used_buffer_names.<locals>.<setcomp>)	itertoolschainrV   rX   rW   r4   r)   r)   r*   used_buffer_names   s    z#BaseSchedulerNode.used_buffer_namesc                 C   sp   t  }t| jj| jjD ]P}||j tj	j
|jrtj	j
|j  }t|tjr||jj  q|S r,   )r!   r   r   rV   rX   rW   addr`   r   graphname_to_bufferrw   
get_layoutr    r
   AliasedLayoutviewdatar7   )r0   Z
used_namesr1   layoutr)   r)   r*   r|      s    z.BaseSchedulerNode.used_or_aliased_buffer_namesc                    s    fdd j D  _ d S )Nc                    s   h | ]}|j  jjkr|qS r)   )r`   rF   available_buffer_namesr   r4   r)   r*   r{      s   z/BaseSchedulerNode.prune_deps.<locals>.<setcomp>r-   r4   r)   r4   r*   r      s    
zBaseSchedulerNode.prune_depsc                    s4   dd   fdd| j jD }| | j | d S )Nc                 S   s   t | to| jtjjkS r,   )r    r   r`   r   r   removed_buffers)r1   r)   r)   r*   should_prune   s    z7BaseSchedulerNode.prune_weak_deps.<locals>.should_prunec                    s   h | ]} |r|qS r)   r)   r   r   r)   r*   r{      s      z4BaseSchedulerNode.prune_weak_deps.<locals>.<setcomp>)rV   rX   rI   remove_reads)r0   Z	to_remover)   r   r*   prune_weak_deps   s    z!BaseSchedulerNode.prune_weak_depsc                    s~   t   jD ](}t|ts |j    d7  < q fddfddjD }j| _j	| dS )a  
        Prunes stardeps intended for mutation ordering
        on an upstream fused node if after fusion there is another dependency
        on the fused upstream node, making the stardep redundant

        In essence this enforces an ordering on fusions. As fusions occur, prunable stardeps will
        be incrementally removed, enabling other fusions, ensuring they are fused in order.
        r   c                    s>   t | tr6 | j   dk}| j k}|p4|S dS d S )Nr   F)r    r   r`   r7   )r1   Zis_redundantZis_self_dep)name_to_dep_countname_to_fused_noder0   r)   r*   r      s    
z<BaseSchedulerNode.prune_redundant_deps.<locals>.should_prunec                    s   h | ]} |r|qS r)   r)   r   r   r)   r*   r{      s      z9BaseSchedulerNode.prune_redundant_deps.<locals>.<setcomp>N)
collectionsCounterr-   r    r   r`   r7   rI   rV   r   )r0   r   r1   Zdeps_to_pruner)   )r   r   r0   r   r*   prune_redundant_deps   s    	

z&BaseSchedulerNode.prune_redundant_depsc                 C   s
   | j  S r,   rG   r7   r4   r)   r)   r*   r7      s    zBaseSchedulerNode.get_namec                 C   s   |   S r,   r7   r4   r)   r)   r*   get_first_name   s    z BaseSchedulerNode.get_first_namec                 C   s
   |   hS r,   r   r4   r)   r)   r*   	get_names   s    zBaseSchedulerNode.get_namesc                 C   s   | gS r,   r)   r4   r)   r)   r*   	get_nodes   s    zBaseSchedulerNode.get_nodesc                 C   s
   | j  S r,   )rG   
get_devicer4   r)   r)   r*   r      s    zBaseSchedulerNode.get_devicec                 C   s   dS r3   r)   r4   r)   r)   r*   r5      s    zBaseSchedulerNode.is_reductionc                 C   s   dS r3   r)   r4   r)   r)   r*   is_template   s    zBaseSchedulerNode.is_templatec                 C   s   dS r3   r)   r4   r)   r)   r*   	is_extern   s    zBaseSchedulerNode.is_externc                 C   s   dS r3   r)   r4   r)   r)   r*   r@      s    zBaseSchedulerNode.is_foreachread_depc                 C   s   dS r3   r)   r0   r   r)   r)   r*   can_inplace  s    zBaseSchedulerNode.can_inplacec                 C   s   dS r3   r)   r4   r)   r)   r*   has_side_effects  s    z"BaseSchedulerNode.has_side_effectsc                    s   j  sd S t tfrB j  s. j  rBtjj	 j  d S t tfspt t
rt j tjtjfrtjrttjtjjjjrttjdd d k	rddlm} t jjdd d}|D ]} jj|j}|rtjj | rć fdd|j!D }t"|dkr|d	 j#r|d	 j  krt|j $ tj%tj&tj'fs||j | j krtjj(|j  j  t)tjd
rtjj*+|,  ,  ttjtjjjjrtjj-.|,  tjj-. ,   j/0|,   d S qtjj	 j  d S )N	mutationsr   )buffer_reuse_keyc                 S   s   | j S r,   r   xr)   r)   r*   <lambda>#      z,BaseSchedulerNode.allocate.<locals>.<lambda>r   c                    s"   g | ]}|j   jjkr|qS r)   )rG   r7   rF   r   rx   r   r4   r)   r*   
<listcomp>*  s
   z.BaseSchedulerNode.allocate.<locals>.<listcomp>r   args)1rG   Zshould_allocater    SchedulerNoder6   r   r   r   wrapper_codeZcodegen_allocationExternKernelSchedulerNoder
   	AllReduceInPlaceHintr   inplace_bufferskerneltorch	_inductorcodegentritonZTritonKernelgetattrZcodegen.wrapperr   r"   rV   rX   rF   name_to_noderw   r`   Z	can_reuserH   lenr   r   ZMultiOutputLayoutZMutationLayoutr   Zcodegen_inplace_reusehasattrr   Zmake_inplacer7   r   r   rM   discard)r0   r   Zordered_readsreadZ
input_nodeZremaining_usesr)   r4   r*   allocate  s    

	


   
zBaseSchedulerNode.allocatec                 C   s"   | j D ]}t|jtr dS qdS )NFT)rH   r    rG   r+   )r0   rr   r)   r)   r*   can_freeW  s    
zBaseSchedulerNode.can_freeTc           	      C   s  t js
d S |r| jrd S | jj}g }|D ]}|jdkr8q(|d |d d|j d|j }d|jkr~|d|jd   }|| d|jkr(|jd  }|	d	d
 }|d|
dd
dd
dd  |d |d q(t|dkrd S || d| _d S )Noutputrc   z#pragma CMT ORIGIN:z#pragma CMT  Zseq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   Zcomment_originrN   rG   originsopappendtargetmetasplitreplacer   
writelines)	r0   bufferZ	only_oncer   Z	out_linesoZop_info_strr   Zstack_trace_last_liner)   r)   r*   codegen_originating_info]  sF    






  

z*BaseSchedulerNode.codegen_originating_infoc                    s   t trdS dd jjD }dd jjD }fdd t trj fdd|D }|| }|| }d}||B D ]`}|tjjkrtjj| }n|tjj	krvtjj	| }nqv|tjj
t| t|  7 }qv|S )Nr   c                 S   s   h | ]
}|j qS r)   r   r   r)   r)   r*   r{     s     zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<setcomp>c                 S   s   h | ]
}|j qS r)   r   r   r)   r)   r*   r{     s     c                    s.   dd  j j|  jD }t|t j dkS )Nc                 S   s   h | ]
}|j qS r)   )rG   )rx   userr)   r)   r*   r{     s     zZBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized.<locals>.<setcomp>r   )rF   r   rH   r   r!   snodes)bufZbuf_usesr4   r)   r*   is_materialized  s    zGBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materializedc                    s   h | ]} |s|qS r)   r)   r   )r   r)   r*   r{     s      )r    NopKernelSchedulerNoderV   rX   rW   rC   r   r   r   graph_inputssizevarsZ	size_hintr   get_sizer   	get_dtype)r0   rX   rW   r   Z
node_bytesr   r)   )r   r0   r*   get_read_write_buffers_sizes  s,    




z.BaseSchedulerNode.get_read_write_buffers_sizesc                    s  d }d }| j s:| jst| jd j  }| jd j  }n| j  }| j  }d|jjkr^dS zt }t|d }W n t	k
r   Y dS X t
| trftt| j ddd }|d k	rddlm} ddlm} |  |dd	r}d
dlm   fdd| j jD }	| j j}
|
j|f|	| j j d}| }|| | W  5 Q R  W  5 Q R  S Q R X W 5 Q R X n&t
| tst
| j tr|  | S dS )Nr   cudal    J)r   rc   )FakeTensorMode)FlopCounterModeF)displayr   ir_node_to_tensorc                    s   g | ]} |qS r)   r)   )rx   inputr   r)   r*   r     s    z;BaseSchedulerNode.get_estimated_runtime.<locals>.<listcomp>g      ?)rG   r   AssertionErrorr   r   devicerQ   r   r   rZ   r    r   kernel_name_to_oprw   r   Ztorch._subclasses.fake_tensorr   Ztorch.utils.flop_counterr   r
   r   inputs	__class__Zprocess_kernelkwargsZget_total_flopsrC   r   r   )r0   r   ZdtypeZgpu_memory_bandwidthZ	gpu_flopsr   r   r   Zflop_counter_modeZfake_inputsclsfactorZcounted_flopsr)   r   r*   get_estimated_runtime  sN    





6 z'BaseSchedulerNode.get_estimated_runtimeN)T)1r8   r9   r:   r
   Bufferr2   r;   r#   rb   rY   re   r   rj   rk   r   rs   r   r}   r~   r   r   r	   
ReadWritesrI   r   r   r|   r   r   r   r7   r   r   r   r   r5   r   r   r@   	MemoryDepr   r   r   r   r   intr   floatr   r)   r)   r)   r*   r<   P   sJ    
 O
)c                   @   s:   e Zd ZedddZdd Zdd Zejdd	d
Z	dS )r   rR   c                 C   s   |    dt| jdd  S )Nz.node.kernel = r   )r7   r   rG   r4   r)   r)   r*   rY     s    z)ExternKernelSchedulerNode.debug_str_extrac                 C   s   dS NTr)   r4   r)   r)   r*   r     s    z#ExternKernelSchedulerNode.is_externc                 C   s   t | jdo| j S )Nr   )r   rG   r   r4   r)   r)   r*   r     s    z*ExternKernelSchedulerNode.has_side_effectsr   c                 C   sz   |   s|  rdS |j| jjkr&dS t| jtjj	j
tjj	jfsFdS t| jjdkrvtt| jj}| | kS dS NFr   )r~   r   r`   rF   r   r    rG   r   r   r
   r   r   r   rV   rW   nextiternumbytes_hintr0   r   	write_depr)   r)   r*   r     s     z%ExternKernelSchedulerNode.can_inplaceN)
r8   r9   r:   r#   rY   r   r   r	   r   r   r)   r)   r)   r*   r     s   r   c                   @   s   e Zd ZdS )r   N)r8   r9   r:   r)   r)   r)   r*   r     s   r   c                       s   e Zd Zdejd fddZedddZdd	 Zd
d Z	dd Z
dd Zdd Zdd Zdd Zdd ZejdddZ  ZS )r   rD   rE   c                    sn   t  || | \| _| _| || jf| _|  rJ| |	  n | t
j| jf| jddi d S )N	normalizeT)superr2   Zsimplify_and_reorder_sizes_bodyr   groupr   rI   Znormalized_read_writesr	   extract_read_writes)r0   rF   rG   group_fnr   r)   r*   r2     s"    zSchedulerNode.__init__rR   c                 C   s   |   }| d| jd  | d| jd  | d| j g}|  rb|| dt|    |  r|| dt|    t| jt	j
r|d| d	 |t| j d
 d|S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z.aliases = z.mutations = zclass z_loop_body:r   r   )r7   r  r  r~   r   r%   r   r    r  r
   ZLoopBodyr&   r   rb   r]   r_   r)   r)   r*   rY     s    zSchedulerNode.debug_str_extrac                 C   s   | j S r,   )r  r4   r)   r)   r*   
get_ranges   s    zSchedulerNode.get_rangesc                 C   s   t | j S r,   )r   rG   Zget_reduction_typer4   r)   r)   r*   r5   #  s    zSchedulerNode.is_reductionc                 C   s   t | jtjS r,   )r    rG   r
   TemplateBufferr4   r)   r)   r*   r   &  s    zSchedulerNode.is_templatec                 G   s   |    | | d S r,   )mark_runr   )r0   
index_varsr)   r)   r*   run)  s    zSchedulerNode.runc                 C   s   |    d S r,   )r   r4   r)   r)   r*   r  -  s    zSchedulerNode.mark_runc                 C   sH   | j }ttt|ttt|ks&ttttj	|tj	|}|S r,   )
r  summapr   r   dictzipr   r   from_iterable)r0   r  sizes
var_rangesr)   r)   r*   ranges_from_index_vars0  s     

z$SchedulerNode.ranges_from_index_varsc              
   C   sz   |  |}zFttt |( tj|  | j|  W 5 Q R X W 5 Q R X W n$ tk
rt   t	
d| j  Y nX d S )NzError in codegen for %s)r  r   Zset_ops_handlerr   Zget_ops_handlerr   Zset_current_noder  rZ   r[   fatalrG   )r0   r  r  r)   r)   r*   r   ;  s    

"zSchedulerNode.codegenc                    s$   j \}  fdd}t||S )zH
        Get the memory dependencies in the non-reduction axis.
        c                    s    | dd  D S )Nc                 S   s   g | ]}t d qS )r   )sympyInteger)rx   _r)   r)   r*   r   M  s     zCSchedulerNode.pointwise_read_writes.<locals>.fn.<locals>.<listcomp>)r  )indexZreduction_sizesr0   r)   r*   fnL  s    z/SchedulerNode.pointwise_read_writes.<locals>.fn)r  r	   r  )r0   r  r  r)   r  r*   pointwise_read_writesF  s    
z#SchedulerNode.pointwise_read_writesr   c                 C   s\   |   s|  rdS t| jjdkrXt|tjrXtt	| jj}|j
|j
koV|j|jkS dS r   )r~   r   r   rV   rW   r    r	   r   r   r   r  sizer   r)   r)   r*   r   Q  s     zSchedulerNode.can_inplace)r8   r9   r:   r
   r   r2   r#   rY   r	  r5   r   r  r  r  r   r  r	   r   r   __classcell__r)   r)   r  r*   r     s   r   c                       s  e Zd ZdZeeedddZdee dddZ	e
ed	d
dZed	ddZe
ee d	ddZed	ddZee eeef d fddZe
ee d	ddZe
ee d	ddZee d	ddZdd Ze
dd Ze
dd  Zd!d" Zd#d$ Ze
d%d& Ze
d'd( Zeeef d)d*d+Zd,d- Zed. d/d0d1Zd2d3 Z d4d5 Z!e"j#d6d7d8Z$d9d: Z%d;d< Z&  Z'S )=rC   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r=   c                 C   s(   |j |j kst| |j | |  S r,   )rF   r   r   )r   r>   r?   r)   r)   r*   rB   c  s    zFusedSchedulerNode.fuserD   )rF   r   c                    s   | _ | _d  _d  _g  _t|dd dj _tjdd |D   _	 
tjdd |D   fddtjd	d |D  D  jj  _td
d  j D  _tdd  j D  _d S )Nc                 S   s   t |  S r,   )r   r5   r   r)   r)   r*   r   o  r   z-FusedSchedulerNode.__init__.<locals>.<lambda>r   c                 S   s   g | ]
}|j qS r)   )rJ   r   r)   r)   r*   r   q  s     z/FusedSchedulerNode.__init__.<locals>.<listcomp>c                 S   s   g | ]
}|j qS r)   )rV   r   r)   r)   r*   r   u  s     c                    s   h | ]}|j   kr|qS r)   r`   r   r   r4   r)   r*   r{   x  s   z.FusedSchedulerNode.__init__.<locals>.<setcomp>c                 S   s   g | ]
}|j qS r)   r   r   r)   r)   r*   r   z  s     c                 S   s   g | ]
}|j qS r)   rK   r   r)   r)   r*   r   }  s     c                 S   s   g | ]
}|j qS r)   )rL   r   r)   r)   r*   r   ~  s     )r   rF   rG   rH   r.   maxr  r!   unionrJ   rI   r	   r   
merge_listrV   rW   r-   minrK   rL   )r0   rF   r   r)   r4   r*   r2   h  s&    
zFusedSchedulerNode.__init__rR   c                 C   s   d dd | jD S )Nr  c                 S   s   g | ]}|  qS r)   r   r   r)   r)   r*   r     s     z/FusedSchedulerNode.get_name.<locals>.<listcomp>)r]   r   r4   r)   r)   r*   r7     s    zFusedSchedulerNode.get_namec                 C   s   | j d  S Nr   )r   r7   r4   r)   r)   r*   r     s    z!FusedSchedulerNode.get_first_namec                 C   s   t jdd | jD  S )Nc                 S   s   g | ]}|  qS r)   )r   r   r)   r)   r*   r     s     z0FusedSchedulerNode.get_names.<locals>.<listcomp>r!   r#  r   r4   r)   r)   r*   r     s    zFusedSchedulerNode.get_namesc                    s.    fddt  jD }td| dS )Nc                    s,   g | ]$\}}    d | d|  qS )z.snodes[z] =
)r7   rb   )rx   irG   r4   r)   r*   r     s   z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>r   r   )	enumerater   r&   r   r]   r^   )r0   ra   r)   r4   r*   rY     s    
z"FusedSchedulerNode.debug_str_extrart   c                    s@   t  || t }t| jD ]}||| ||j qd S r,   )r  r}   r!   reversedr   updaterM   )r0   ru   rv   rG   r  r)   r*   r}     s
    z!FusedSchedulerNode.set_last_usagec                 C   s   t jdd | jD  S )Nc                 S   s   g | ]}|  qS r)   )r   r   r)   r)   r*   r     s     z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>r'  r4   r)   r)   r*   r     s    z$FusedSchedulerNode.used_buffer_namesc                 C   s   t jdd | jD  S )Nc                 S   s   g | ]}|  qS r)   )r|   r   r)   r)   r*   r     s     zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r'  r4   r)   r)   r*   r|     s    z/FusedSchedulerNode.used_or_aliased_buffer_namesc                 C   s   | j S r,   )r   r4   r)   r)   r*   r     s    zFusedSchedulerNode.get_nodesc                 C   s   t | j d|   dS )Nz(nodes=rO   rP   r4   r)   r)   r*   r;     s    zFusedSchedulerNode.__repr__c                 C   s   t dd | jD S )Nc                 s   s   | ]}|  V  qd S r,   )r5   r   r)   r)   r*   	<genexpr>  s     z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>anyr   r4   r)   r)   r*   r5     s    zFusedSchedulerNode.is_reductionc                 C   s   t dd | jD S )Nc                 s   s   | ]}|  V  qd S r,   )r   r   r)   r)   r*   r,    s     z1FusedSchedulerNode.is_template.<locals>.<genexpr>r-  r4   r)   r)   r*   r     s    zFusedSchedulerNode.is_templatec                 C   s   dS r3   r)   r4   r)   r)   r*   r@     s    zFusedSchedulerNode.is_foreachc                 C   s
   | j d S r&  )r  r4   r)   r)   r*   r     s    zFusedSchedulerNode.get_devicec                 C   s   t dd | jD S )Nc                 s   s   | ]}|  V  qd S r,   )r   r   r)   r)   r*   r,    s     z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r-  r4   r)   r)   r*   r     s    z+FusedSchedulerNode.has_aliasing_or_mutationc                 C   s&   t  }| jD ]}||  q|S r,   )r   r   r   r+  r   )r0   r   rG   r)   r)   r*   r     s    
zFusedSchedulerNode.op_countsrf   c                 C   s   t d S r,   NotImplementedErrorri   r)   r)   r*   rj     s    z'FusedSchedulerNode.update_mutated_namesc                 C   s   t d S r,   r/  r0   r`   r)   r)   r*   rk     s    z#FusedSchedulerNode.add_mutation_deprl   rm   c                 C   s   t d S r,   r/  )r0   rH   r)   r)   r*   rs     s    zFusedSchedulerNode.set_usersc                 C   s   t d S r,   r/  r4   r)   r)   r*   r~     s    zFusedSchedulerNode.get_aliasesc                 C   s   t d S r,   r/  r4   r)   r)   r*   r     s    z FusedSchedulerNode.get_mutationsr   c                 C   s   t d S r,   r/  r   r)   r)   r*   r     s    zFusedSchedulerNode.can_inplacec                 C   s   t d S r,   r/  r4   r)   r)   r*   r     s    zFusedSchedulerNode.allocatec                 C   s   t d S r,   r/  r4   r)   r)   r*   r     s    zFusedSchedulerNode.can_free)(r8   r9   r:   __doc__classmethodr<   rB   r   r   r2   r   r#   r7   r   r   r   rY   r   r}   r   r|   r   r;   r5   r   r@   r   r   r   rj   rk   rs   r~   r   r	   r   r   r   r   r  r)   r)   r  r*   rC   \  sJ    




rC   c                       s   e Zd ZdZdd Zdd Zedd Zedd	 Zdde	e
 d fddZdd Zdd Zdd Zdd Zdd Zdd Zdd Z  ZS )rA   z{Scheduler node which consists of a list of scheduler nodes that each operate on a
    distinct tensor in a list of tensors.c                 C   s    |  | jkr| j|   S d S r,   )r7   read_to_node)r0   producerr)   r)   r*   get_consumer_subnode_for  s    z3ForeachKernelSchedulerNode.get_consumer_subnode_forc                 C   s.   |j jD ] }|j| jkr| j|j   S qd S r,   )rV   rX   r`   r   )r0   consumerrdr)   r)   r*   get_producer_subnode_for  s    z3ForeachKernelSchedulerNode.get_producer_subnode_forc                    s      rD|  rDt jt|jkoBt fddt j|jD S |  rp| }|d k	rl|j |S dS    r |}|d k	r j||S dS t	dd S )Nc                 3   s    | ]\}} j ||V  qd S r,   )rF   can_fuserx   lrr5  r)   r*   r,    s   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>FzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)
r@   r   r   allr  r6  rF   r:  r9  r   )r   r5  r7  consumer_subnodeproducer_subnoder)   r>  r*   r:    s"     

z#ForeachKernelSchedulerNode.can_fusec           
      C   s  |  s|  std }d }|  rF|  rFdd t|j|jD }n|  r||}g }|}d }|jD ]2}||krt||}|}|| qj|| qjnX|  r||}	g }|}d }|jD ]2}||	krt||}|}|| q|| q| |j	|||S )Nc                 S   s   g | ]\}}t ||qS r)   )rC   rB   r;  r)   r)   r*   r     s   z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>)
r@   r   r  r   r9  rC   rB   r   r6  rF   )
r   r5  r7  prev_node_1prev_node_2fused_nodesrA  rG   new_noder@  r)   r)   r*   rB     s<    



zForeachKernelSchedulerNode.fuseNrD   )rF   nodesc           
         sP  i  _ i  _|d ks|d krht || |D ]6}|jjD ]}| j |j< q:| D ]}| j|< qTq.n| _| _	 
tj|j|jg  fddt|j|jD  jj  _t|j|jg _t|j|jg _| r|n|}| r|n|}	|j _ j|	j |j _|	 D ]}|	 j|< q |d  df _t  _d S )Nc                    s   h | ]}|j   kr|qS r)   r   r   r4   r)   r*   r{   O  s   z6ForeachKernelSchedulerNode.__init__.<locals>.<setcomp>r   )r4  r   r  r2   rV   rX   r`   r   rF   r   rI   r	   r   r$  r!   r#  r-   rW   r%  rK   r"  rL   r@   rJ   r+  r   r  r   )
r0   rF   rF  rB  rC  rG   r   r`   Zforeach_node
other_noder  r4   r*   r2   2  sF    

 z#ForeachKernelSchedulerNode.__init__c                 C   s   t d S r,   r/  r4   r)   r)   r*   r  h  s    z#ForeachKernelSchedulerNode.mark_runc                 C   s   | j  | j    d S r,   )rG   Zget_store_functionZmake_loaderr4   r)   r)   r*   r   k  s    z"ForeachKernelSchedulerNode.codegenc                 C   s   t S r,   r/  r4   r)   r)   r*   r   n  s    z#ForeachKernelSchedulerNode.can_freec                 C   s   dS r   r)   r4   r)   r)   r*   r@   q  s    z%ForeachKernelSchedulerNode.is_foreachc                 C   s
   t | jS )zReturns a list of nodes which comprise the foreach kernel, operating on corresponding elements of our input lists.
        These nodes may be vertically fused.)rp   r   r4   r)   r)   r*   get_subkernel_nodest  s    z.ForeachKernelSchedulerNode.get_subkernel_nodesc                 C   s   t tjdd | jD  S )ziReturns all nodes contained in this kernel, unpacking fused nodes into their constituent scheduler nodes.c                 S   s   g | ]}|  qS r)   )r   r   r)   r)   r*   r   {  s     z8ForeachKernelSchedulerNode.get_nodes.<locals>.<listcomp>)rp   r   r   r   r4   r)   r)   r*   r   y  s    z$ForeachKernelSchedulerNode.get_nodesc                 C   s   | j d  S r&  )r   r   r4   r)   r)   r*   r   }  s    z)ForeachKernelSchedulerNode.get_first_name)NN)r8   r9   r:   r2  r6  r9  r3  r:  rB   r   r   r2   r  r   r   r@   rH  r   r   r  r)   r)   r  r*   rA     s(   

*  6rA   r)   c                    s`   t j fdd}ttttd }t|dkrJfdd|D tjr\|j|d |S )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                    s     dks dkr2t   dk dkS  fddD }fddD }tdd t||D }tdd t||D }||krdS ||krdS t  S )	Nr   c                    s   g | ]}|  qS r)   r)   rx   sl)ar)   r*   r     s     z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>c                    s   g | ]}|  qS r)   r)   rI  )br)   r*   r     s     c                 s   s"   | ]\}}|d kp||k V  qdS r   Nr)   rx   Zsl_aZsl_br)   r)   r*   r,    s    z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>c                 s   s"   | ]\}}|d kp||k V  qdS rM  r)   rN  r)   r)   r*   r,    s    r   )r   r  r  )rK  rL  Zstride_len_aZstride_len_bZa_firstZb_firstr  stride_lengths)rK  rL  r*   	index_cmp  s    z"pick_loop_order.<locals>.index_cmpr   c                    s   g | ]} | qS r)   r)   )rx   pi)rP  r)   r*   r     s     z#pick_loop_order.<locals>.<listcomp>r   )		functools
cmp_to_keyrp   r*  ranger   r   Zpick_loop_orderssort)rP  r  Zpriority_idxrQ  orderr)   rO  r*   pick_loop_order  s    rX  c                   @   sF   e Zd ZU eed< dZeed< dZeed< dd Zd d ddd	Z	d
S )rl   rG   Fr   is_weakc                 C   s
   | j  S r,   r   r4   r)   r)   r*   r7     s    zNodeUser.get_name)otherrS   c                 C   s.   | j |j kstt| j | jo |j| jo*|jS r,   )rG   r   rl   r   rY  )r0   rZ  r)   r)   r*   ro     s    

zNodeUser.mergeN)
r8   r9   r:   r<   __annotations__r   r   rY  r7   ro   r)   r)   r)   r*   rl     s
   
rl   c                       sB  e Zd Ze fddZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd ZeedddZeedd d!Zd"d# Zeedd$d%Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zed6d7d8Z e!j"d9d:d;Z#e!j"d9d<d=Z$d>d? Z%ed@dA Z&  Z'S )BrD   c                    sl  t    i  _i  _g  _tjj tjj	  _
 fdd|D  _ j
tjj	   jD ]}|  qddd  jD  _d  _i  _i  _            t jt j7  _tj j t j _dd  jD  _            tj j tj j     d  _!t"  _#t"  _$i  _%t&'dt j d S )Nc                    s   g | ]}  |qS r)   )create_scheduler_noderx   nr4   r)   r*   r     s     z&Scheduler.__init__.<locals>.<listcomp>c                 S   s   i | ]}|  |qS r)   r   r]  r)   r)   r*   
<dictcomp>  s      z&Scheduler.__init__.<locals>.<dictcomp>c                 S   s   i | ]}|  |qS r)   r   r]  r)   r)   r*   r_    s      z)Number of scheduler nodes after fusion %d)(r  r2   backendsZ
fuse_cacherF  r   r   r   keys	constantsr   r+  r   r   r   rv   mutation_renamescompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_predecessorsr   Zir_nodes_pre_fusionr   debugZir_pre_fusionZnum_orig_nodescreate_foreach_nodes
fuse_nodescompute_last_usageZir_post_fusionZgraph_diagramdebug_draw_graphcurrent_devicer!   buffer_names_to_freebuffer_names_no_longer_neededorigin_to_indexr[   rd   )r0   rF  rG   r  r4   r*   r2     sF    




zScheduler.__init__c                 C   s0   t jdddkr,ddlm} || jdd dS )z,Generate an image of the graph for debuggingZINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)Zprint_graph)osenvironrw   rh  rr  rF  )r0   rr  r)   r)   r*   rl    s    zScheduler.debug_draw_graphc                 C   s0   t tjr,t d| | jD ]}|  qd S )Nz%s:)r[   isEnabledForloggingINFOrd   rF  re   )r0   labelrG   r)   r)   r*   debug_print_nodes   s    
zScheduler.debug_print_nodesc                 C   st   |j d k	std| r$t| |S t|tjtjfrR| |	 j
}t| ||S t|tjrht| |S t|d S )Nz2All nodes passed to scheduling must have an origin)r   r   Zis_no_opr   r    r
   r   r
  get_backendr   r  r   ZExternKernelr   r0  )r0   rG   r  r)   r)   r*   r\    s    

zScheduler.create_scheduler_nodec                    s   t  g }j  tjj D ]`}|  fdd|D }|sFq fdd|D }t|}|	| |D ]}|j|< qpq fddj
D | _
d S )Nc                    s   g | ]}| kr|qS r)   r)   rx   r`   )kept_node_namesr)   r*   r     s      z2Scheduler.create_foreach_nodes.<locals>.<listcomp>c                    s   g | ]} j | qS r)   )r   r{  r4   r)   r*   r   !  s     c                    s   g | ]}|   kr|qS r)   r   )rx   rG   )removed_node_namesr)   r*   r   )  s     )r!   r   ra  r   r   listsrq   r+  rA   r   rF  )r0   Zfe_nodesnamesr   Zfe_noder`   r)   )r|  r}  r0   r*   ri    s$    




zScheduler.create_foreach_nodesc                    s  t tjD ]}| }| D ]~}|kr|kr| }| }|| } D ]$}| |kst| |krX||< qXq$|kr| |< q$| |< q$qfdd fdd dfdd	}jD ]}	|	 D ]r}
|
}
||
|	 |	t	|
 |
 D ]D}| } |	 }||kr|	t
| |||	dd	 qq|	jjD ]&}t|t
}||j|	|	|| qb|	j |	 D ]>}
|	 j|
< |	 j|
< j|
|
j|	 < qqtj D ]}||tt	| qjD ]4}|tjjkr
||tt	| tjj| q
d
d ttjj D fddtjjD tj_jD ]}	|	|	   qzjD ]"}	|	jD ]}|jj !|	 qqdS )zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                    s   | j kr j |  S | S r,   )rc  r^  )rh   r0   r)   r*   rh   G  s    
z.Scheduler.compute_dependencies.<locals>.renamec                    s~   | h}j |  }t|jjd }|jjD ]P}|jj kr(t|tjr(t|tjr(|j	|j	kr(|j
|j
kr(| |j q(|S r&  )r   rp   rV   rW   rX   r`   r    r	   r   r  r  r+  )	node_nameZreachable_namesrG   r   r   )dep_closurer0   r)   r*   r  L  s     





z3Scheduler.compute_dependencies.<locals>.dep_closureFc                    s    |   t||| d S r,   )r   rl   )Zused_by_nameZ	user_noder   rY  )name_to_usersrh   r)   r*   add_user[  s    
z0Scheduler.compute_dependencies.<locals>.add_userT)rY  c                 S   s   i | ]\}}||qS r)   r)   )rx   r  r`   r)   r)   r*   r_    s     z2Scheduler.compute_dependencies.<locals>.<dictcomp>c                    s   g | ]} | qS r)   r)   r{  )	inp_namesr)   r*   r     s    z2Scheduler.compute_dependencies.<locals>.<listcomp>N)FF)"r   defaultdictrp   rF  r7   r~   ra  r   rk   r   r   rV   rX   r    r`   r   rj   rc  rv   rw   r   r   get_output_namesr+   r   Zmutated_inputsr   r)  Zmutated_input_idxsrs   rH   rG   r.   r   )r0   r>   Z
node1_nameZ
node2_nameZlist1Zlist2combinedr   r  rG   Zalt_namerG  Z
other_nameZknown_dep_node_namesr   rY  r  r`   r   r)   )r  r  r  rh   r0   r*   rd  -  sp    




 





zScheduler.compute_dependenciesc                    s   d}|rg }| j D ]f}tddd |  oDt fdd|jD }|sV|| qtd|  t	j
j|  qt| j t|k}|| _ q| j D ]}|  qdS )	z0
        Remove any nodes without users
        Tr   c                 S   s   | j p|  tjjkS r,   )rY  r7   r   r   r   r  r)   r)   r*   can_eliminate_user  s    z;Scheduler.dead_node_elimination.<locals>.can_eliminate_userc                 3   s   | ]} |V  qd S r,   r)   )rx   ur  r)   r*   r,    s    z2Scheduler.dead_node_elimination.<locals>.<genexpr>zremoved dead node: %sN)rF  rl   r   r?  rH   r   r[   rh  r7   r   r   r   r   r   r   )r0   ZagainZupdated_nodesrG   Zcan_eliminater)   r  r*   rf    s     

zScheduler.dead_node_eliminationc                    sb   t  t  g  fdd| jD ]}| D ]}| |< q4q(| jD ]}| qJ| _dS )zD
        Ensure self.nodes is in topologically sorted order
        c                    sF   | krB |  t| jdd dD ]} |j  q$|  d S )Nc                 S   s   | j S r,   r   )dr)   r)   r*   r     r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>r   )r   r"   r-   r`   r   )r^  r1   r   r(   seenvisitr)   r*   r    s
    
z2Scheduler.topological_sort_schedule.<locals>.visitN)r!   r  rF  r   )r0   rG   r`   r)   r  r*   re    s    


z#Scheduler.topological_sort_schedulec                 C   sr   i }| j D ]B}t }|jD ]}||j |||j O }q||| < ||_q
t| j D ]\}}||_||_	qXdS )z;
        Populate each node.recursive_predecessors
        N)
rF  r!   r-   r   r`   r7   rJ   r)  rK   rL   )r0   Zname_to_predecessorsrG   rJ   r1   rW  r)   r)   r*   rg    s    

zScheduler.compute_predecessorsc                 C   s6   t dD ](}t| j}|   t| j|kr q2qdS )zO
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.
        
   N)rU  r   rF  fuse_nodes_once)r0   r  Zold_lenr)   r)   r*   rj    s
    
zScheduler.fuse_nodesc                    s   t | j}|  D ]\}}| j|  }| j|  }| ||r| ||st|| || || |	  | j
 fdd  D  qt|dd d| _|   |   dS )a  
        Mutates self.nodes to combine nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuses(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        c                    s   i | ]}|   qS r)   r   r]  Znode3r)   r*   r_    s      z-Scheduler.fuse_nodes_once.<locals>.<dictcomp>c                 S   s   | j S r,   r!  r   r)   r)   r*   r      r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>r   N)r!   rF  get_possible_fusionsr   r   r:  will_fusion_create_cyclerB   remover   r+  r   r"   re  r   )r0   rD  r>   r?   r)   r  r*   r    s$    
 



zScheduler.fuse_nodes_oncec                 C   s   | j D ]}|| j qd S r,   )rF  r   r   )r0   rG   r)   r)   r*   r     s    
zScheduler.prune_redundant_depsc                    s   g  t   fdd}tt}jD ] }| D ]}|| | q6q*| D ]}|| qTtj	rtt}jD ]"}t
|dd}|rx|| | qx| D ]}|| qt jddS )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                    s   t | D ]|\}}| |d d  D ]b}||f}|kr6q | ||rX | q | sh| r ||r  ||f q qd S )Nr   )r)  r   r:  r   r   r@   )rF  Znode1_indexr>   r?   r   Zpossible_fusionsr  r0   r)   r*   check_all_pairs  s    
 z7Scheduler.get_possible_fusions.<locals>.check_all_pairsr  NT)r   reverse)r!   r   r  rp   rF  r   r   rq   r   aggressive_fusionr   r"   score_fusion_key)r0   r  Zbuffer_names_groupingrG   r   Znode_groupingZgroup_groupingr  r)   r  r*   r    s$    





zScheduler.get_possible_fusionsc                    sR    fdd t  | | B |j|jB  t fddD S )zHFinds whether there's a path from src to dst caused indirectly by fusionc                    sn   t | trj| krj|  t| j@ }|r2|S |  }|}|rL|S t fdd| j D S dS )Nc                 3   s   | ]} j | V  qd S r,   r   r]  checkr0   r)   r*   r,  B  s   zDScheduler.will_fusion_create_cycle.<locals>.check.<locals>.<genexpr>F)r    rC   r   r   rJ   r   issubsetr.  )rG   Zcond0r  Zshortcutr  Zcombined_namesZcombined_predecessorsr0   visitedr)   r*   r  4  s    

z1Scheduler.will_fusion_create_cycle.<locals>.checkc                 3   s   | ]} j | V  qd S r,   r  r]  r  r)   r*   r,  M  s     z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>)r!   r   rJ   r.  r0   r>   r?   r)   r  r*   r  1  s    
z"Scheduler.will_fusion_create_cycler=   c                 C   s*   t t|j|j t|j|j }|dkS )aA  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heursitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r"  absrK   rL   )r0   r>   r?   proximity_scorer)   r)   r*   can_fusion_increase_peak_memoryO  s
    z)Scheduler.can_fusion_increase_peak_memoryc                 C   sv  ||krdS t |ttfr&| s&dS t |ttfr@| s@dS | sP| r\t||S | |j@ rndS | rzdS | r|	 s|
 stjsdS | }|| krdS | ||dk}|rtjr|
 s|
 rdS | s| st| t|  tjkrdS | |j@ rN| ||s<dS | |||S | ||r`dS | |||S dS )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        Fr   N)r    r   r   r   r@   rA   r:  r   rJ   r   r5   r   Zepilogue_fusionr   score_fusion_memoryr  r   r   Zmax_fusion_sizecan_fuse_verticalrz  r  can_fuse_horizontal)r0   r>   r?   r   Zno_shared_datar)   r)   r*   r:  j  sh    zScheduler.can_fusec           	      C   s   |  }t }|jD ]}|jjD ]|}|j|jkr t|t|kr t|jds t|jds |j|jkr t	|j
t	|j
kr |j
dt	|j
 |j
kr || q qdd |j| D }||@ rdS |D ]}|| j| j@ r dS qdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        tmpNc                 S   s   h | ]
}|j qS r)   r   r   r)   r)   r*   r{     s     z.Scheduler.can_fuse_vertical.<locals>.<setcomp>FT)r   r!   r-   rV   rW   r`   rQ   r   r  r   r  r   r   rJ   )	r0   r>   r?   Znode1_namesZcomputed_depsr8  cdZremaining_depsr`   r)   r)   r*   r    s4    




	zScheduler.can_fuse_verticalc                 C   sb   |  ||}tt|j|j t|j|j  }| tjkoD|dk| | koZ|dk||fS )a\  
        Assign a score (higher comes first) to the fusion of node1
        and node2.  When different fusions conflict with each other,
        this is the way we decide what order to run them in.

        Our current score is based on:
        - Estimate of the saved memory operations
        - Fusions closer together in original order
        r   )	r  r"  r  rK   rL   r   r   Zepilogue_fusion_firstr5   )r0   r>   r?   Zmemory_scorer  r)   r)   r*   score_fusion  s    
zScheduler.score_fusionc                 C   s2   |j j|j jB |j j|j jB @ }tdd |D S )zf
        The first term in our fusion score that estimates number of saved memory operations.
        c                 s   s   | ]}|  V  qd S r,   )r   r   r)   r)   r*   r,    s     z0Scheduler.score_fusion_memory.<locals>.<genexpr>)rV   rX   rW   r  )r0   r>   r?   Zcommon_memory_depsr)   r)   r*   r    s    zScheduler.score_fusion_memoryc                 C   s   |\}}|  ||S )z-
        Shim for list.sort(key=...)
        )r  )r0   rF  r>   r?   r)   r)   r*   r    s    zScheduler.score_fusion_keyc                 C   sN   t  }tj D ]}|| qt| jD ]}||| j |	|j
 q*dS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)r!   r   r   r  r   r*  rF  r}   rv   r+  rM   )r0   ru   r  rG   r)   r)   r*   rk    s    zScheduler.compute_last_usagec                 C   s   t | jtjj tjjj D ]h}|| jkrN| j| }| rtjj	|j
 q|tjjkrtjj| j}| stttjj	|j q| j  dS )z*Free any buffers that are no longer neededN)r"   rn  r   r   r   r   Zfreedr   r   Zcodegen_freerG   r   r   Zis_input_bufferr   clear)r0   r`   rG   Zstorager)   r)   r*   free_buffers  s     

zScheduler.free_buffersc                    s   t jjj@  fdd}tt|   D ]v}|t jjjkrt jjj| }t|t	rb|
drbq,t fdd|jD }|r| t jj| q,| q,dS )zr
        Any buffers that are both created and have a last use in the
        same kernel can be removed.
        c                    s.   | t jjko,| t jjjko,|  jko,|  jkS r,   )r   r   Zmust_keep_buffersr   Zinput_buffersrc  rv   r  r4   r)   r*   remove_filter  s    z<Scheduler.remove_kernel_local_buffers.<locals>.remove_filterREMOVEDc                 3   s   | ]}| kV  qd S r,   r)   r]  )names_to_remover)   r*   r,  -  s     z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>N)r   r   Zstore_buffer_namesro  rp   filterr   r   r    r#   
startswithr?  Zother_namesremove_inplace_bufferr   Zinplaced_to_remover   remove_buffer)r0   r  r`   r   r  r)   )r  r0   r*   remove_kernel_local_buffers  s    
z%Scheduler.remove_kernel_local_buffersc                 C   s,   t d| dtjjj|< tjj| d S )Nzremove_buffer(%r)r  )	r[   rh  r   r   r   Zoutput_buffersr   r   r   r1  r)   r)   r*   r  4  s    zScheduler.remove_bufferc                 C   sD   t d| tjjj| j}|ddtjjj|< tjj	
| d S )Nzremoving_inplace_buffer(%r)Z
in_out_ptrr  )r[   rh  r   r   r   r   
inner_namer   r   r   r   )r0   r`   r  r)   r)   r*   r  <  s     zScheduler.remove_inplace_bufferc                 C   s$   | j  D ]}|  q
|   d S r,   )r`  rq   flushr  )r0   backendr)   r)   r*   r  D  s    
zScheduler.flush)scheduler_nodec                 C   s6   t |tst|  |j}|tjj | 	  d S r,   )
r    r   r   r   rG   r   r   r   r   r  )r0   r  rG   r)   r)   r*   codegen_extern_callI  s
    zScheduler.codegen_extern_call)r   c                 C   s   |j dks"|jd k	s"t| dtjj|j  tj|j t|j }|d krbt	d|j  |j dkrt
 stj|}|jdk rt	d|j d|j d|j nt	d|| S )	Nr   z( should have been normalized in loweringzUnsupported device type:    zFound z which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability .zCannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton)rQ   r  r   r   r   Zdevice_typesr   Zadd_device_idxr   RuntimeErrorr   r   r   Zget_device_propertiesmajorr`   minor)r0   r   Zdevice_schedulingZdevice_propsr)   r)   r*   create_backendP  s*    

zScheduler.create_backendc                 C   s$   || j kr| || j |< | j | S r,   )r`  r  )r0   r   r)   r)   r*   rz  h  s    
zScheduler.get_backendc                    sD   fdd  fdd|  D }|r@t|\}}tjj| d S )Nc                    s2   |  j kr( j dd t| jjD   j |  S )Nc                 S   s   i | ]\}}||qS r)   r)   )rx   r(  r^  r)   r)   r*   r_  p  s      z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>)rp  r+  r)  r   rF  r  r4   r)   r*   	get_ordern  s    
z*Scheduler.enter_context.<locals>.get_orderc                    s&   g | ]}|j jD ]} ||fqqS r)   )rG   r   )rx   r^  e)r  r)   r*   r   s  s     
  z+Scheduler.enter_context.<locals>.<listcomp>)r   r"  r   r   r   enter_context)r0   rG   r   r  lastr)   )r  r0   r*   r  m  s
    zScheduler.enter_contextc                 C   s  | j D ]}| | | j|j t|ts| }|| jksP|	 sP|
 rX|   || jkr|jdkr| jr| jjdkrtjj  |jd k	stdtjj|j n| jr| jjdkrtjj  || _| j|j |
 r
| ^}}| ||| np|	 r | | nZ| r<| || n>t|ttfrb| ||  nt|tsrt|  tj j!r| |"  | j#|$  q|   d S )Nr   zdevice should have an index)%rF  r  ro  r+  rM   r    r   r   rm  r   r   r  rQ   r   r   r   Zcodegen_device_guard_exitr  r   Zcodegen_device_guard_enterrn  r   rz  codegen_templater  r@   Zcodegen_foreachrC   r   codegen_nodesr   r   r   Zdebug_sync_kernelcodegen_syncr   r   )r0   rG   r   epiloguer)   r)   r*   r   x  sJ    







zScheduler.codegen)(r8   r9   r:   r   r2   rl  ry  r\  ri  rd  rf  re  rg  rj  r  r   r  r  r<   r  r:  r  r  r  r  rk  r  r  r  r  r  r   r  r   r   r  rz  r  r   r  r)   r)   r  r*   rD     sF   9m
) =(	 rD   c                   @   sj   e Zd ZeedddZeedddZdd Zeee dd	d
Zee dddZ	dd Z
dd ZdS )BaseSchedulingr=   c                 C   s
   t  dS )zO
        Check whether node1 and node2 can be vertically fused or not.
        Nr/  r  r)   r)   r*   r    s    z BaseScheduling.can_fuse_verticalc                 C   s
   t  dS )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        Nr/  r  r)   r)   r*   r    s    z"BaseScheduling.can_fuse_horizontalc                 C   s
   t  dS )z[
        Process the iteration sizes in case a transformation needs to be applied.
        Nr/  )r0   r  r)   r)   r*   r    s    zBaseScheduling.group_fn)template_nodeepilogue_nodesc                 C   s
   t  dS )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        Nr/  )r0   r  r  r)   r)   r*   r    s    	zBaseScheduling.codegen_template)rF  c                 C   s
   t  dS )zD
        Generate a kernel given a list of pre-fused nodes.
        Nr/  )r0   rF  r)   r)   r*   r    s    zBaseScheduling.codegen_nodesc                 C   s
   t  dS )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        Nr/  r4   r)   r)   r*   r    s    zBaseScheduling.codegen_syncc                 C   s
   t  dS )z]
        Flush the generated kernel and python wrapper code to the source code file.
        Nr/  r4   r)   r)   r*   r    s    zBaseScheduling.flushN)r8   r9   r:   r<   r  r  r  r   r  r  r  r  r)   r)   r)   r*   r    s    r  )r)   )@r   dataclassesrS  r   rv  rs  r$   r&   typingr   r   r   r   r  r   Ztorch._dynamo.utilsr   rc   r   r	   r
   r   Zcodegen.commonr   r   r   r   r   r   utilsr   r   r   r   r   r   r   r   Zvirtualizedr   	getLoggerr8   r[   r%   r+   rB   ZopsZatenZconvolutionmmZbmmZaddmmr   r<   r   r   r   rC   rA   rX  	dataclassrl   rD   r  r)   r)   r)   r*   <module>   s`   (


	    _  !
(     n