U
    9%e                     @   sv   d dl Z d dlZddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
 dd Zd	d
 Zdd ZdddZdd ZdS )    N   )cdiv)runtime)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsc                 C   s@   |t |d }tj|d d }t ||| t|| | }|S z# return compute throughput in TOPS    multiprocessor_count)minr   utilsget_device_propertiesr   backenddevicenum_ctas	num_warpsdtypeZtotal_warpsZnum_subcoresZtflops r   [/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/triton/ops/matmul_perf_model.pyget_tensorcore_tflops   s    r   c                 C   s@   |t |d }tj|d d }t ||| t|| | }|S r	   )r   r   r   r   r   r   r   r   r   get_simd_tflops   s    r   c                 C   sB   t j|}|d dk r2|t jkr2t| ||||S t| ||||S )Nr      )torchcudaget_device_capabilityfloat32r   r   )r   r   r   r   r   
capabilityr   r   r   
get_tflops   s    r   Fc           ,      K   s  t jj}tj }|j}| }t||}t||	}|}|| | }t	||t	||	 }}d| | | d }t
|||| |}|| }tj|d }td|| }td|d }t	td|d d d}t|||d |d	   }|d
 }|| | dd|d    }|| | d |d  } || | dd|d    }!|| | d |d  }"||! d }#| |" d }$|#| |$|  }%|d }&|| | | d }'|dkr|'|& }(n(|&})|'|) }(|| d d |& }*|(|*7 }(t	||%|( }+|rtd|+ d| d|% d|( d|d  d |+S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r
   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r   r   CUDAr   r   current_devicer   element_sizer   maxr   r   r   r   r   r   print),r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   r   dtsizeZ	num_cta_mZ	num_cta_nZ	num_cta_kr   Z	total_opsZtputZ
compute_msZnum_smZactive_cta_ratioZactive_cta_ratio_bw1Zactive_cta_ratio_bw2Zdram_bwZl2_bwZload_a_dramZ	load_a_l2Zload_b_dramZ	load_b_l2Z
total_dramZtotal_l2Zload_msZstore_bwZstore_c_dramZstore_msZ	reduce_bwZzero_msZtotal_time_msr   r   r   estimate_matmul_time"   sJ    





,r8   c                    s  t j }t j }|d  }|d j}g }| D ]b}|j}|d |d |d |jf\}	}
}}tj	
|d }|	|
 | | | }||kr2|| q2|} |t jt jfkrdd | D } i }| D ]t}|j}|d |d |d |d |j|jf\}	}
}}}}|	|
|||f}||kr&|| ||f q||fg||< qg }| D ]\}}|\}	}
}}}|d	 d
kr|	|
 | d }|td| d
 }d}||  tjd| fddd}|D ]}||d	  qn|d	 d	 }d|_|| qB|S )Nr+   r1   r2   r3   Zmax_shared_memc                 S   s   g | ]}|j d  dkr|qS )r4   r    )r6   ).0configr   r   r   
<listcomp>z   s      z&early_config_prune.<locals>.<listcomp>r4   r   r   i   r
   i,  r   c                    s0   | d   dk r$dt | d    S | d   S )Nr    r   
   )abs)xZoptimal_num_stagesr   r   <lambda>   s    z$early_config_prune.<locals>.<lambda>)key)r   r   r&   r   r'   r   r6   r*   r   r   r   appendZfloat16r   r   itemsr   heapq	nsmallest)ZconfigsZ
named_argsr   r   r7   r   Zpruned_configsr:   kwr1   r2   r3   r*   Zmax_shared_memoryZrequired_shared_memoryZconfigs_mapr4   r   rA   kvZmmasZ
mma_cyclesZldgsts_latencyZnearestnZrandom_configr   r?   r   early_config_pruned   sP    



"
rJ   )F)rD   r    r   Z_C.libtriton.tritonr   r   testingr   r   r   r   r   r   r8   rJ   r   r   r   r   <module>   s    
B