U
    9%e(E                     @   s   d dl Z d dlZd dlZd dlZd dlmZ ddlmZ dd Zd'dd	Z	d(ddZ
d)ddZG dd dZG dd dZdd Zd*ddZd+ddZdd Zdd  Zed,d#d$Zd-d%d&ZdS ).    N)contextmanager   )runtimec                 C   sL   d | } dddd|  dg}t|}|tjjd}dd |D }|S )	N,
nvidia-smi-i0--query-gpu=--format=csv,noheader,nounitsc                 S   s   g | ]}t |qS  int.0xr   r   M/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/triton/testing.py
<listcomp>   s     znvsmi.<locals>.<listcomp>join
subprocesscheck_outputdecodesysstdoutencodingsplitattrscmdoutretr   r   r   nvsmi
   s    

r!      c              	      s  dd l j j kr$td|   |d k	rT|D ]}|  |d d |_q6j  j	  |   W 5 Q R X j
   fdd} jjdd}jjdd}|  |   |  j
  ||}tdt|| }fdd	t|D }fd
d	t|D }g }d}	t|	D ]}
j
  t|D ]@}|d k	r\|D ]}d |_qN||   |   ||   q<j
  dd	 t||D }|| q&| S )Nr   zQCannot capture graph in default stream. Please use side stream in benchmark code.Tc                      s      S N)Zreplayr   )gr   r   <lambda>,       z$do_bench_cudagraph.<locals>.<lambda>Zenable_timingr   c                    s   g | ]} j jd dqS Tr'   cudaEventr   itorchr   r   r   8   s     z&do_bench_cudagraph.<locals>.<listcomp>c                    s   g | ]} j jd dqS r(   r)   r,   r.   r   r   r   9   s     2   c                 S   s   g | ]\}}| |qS r   elapsed_timer   ser   r   r   r   K   s     )r/   r*   Zcurrent_streamZdefault_streamRuntimeErrorZdetach_Zrequires_grad_gradZ	CUDAGraphgraphsynchronizer+   recordr2   maxr   rangetensorzipappendminmeanitem)fnrepgrad_to_noner   start_event	end_eventestimate_msn_repeatr    Z	n_retries_r-   timesr   )r$   r/   r   do_bench_cudagraph   sN    








rL      d   TrA   c                    s  |dkst dd l |    j  |r@ jtd jdd}n jtd jdd} jjdd} jjdd}	|  t	d	D ]}
|
  |   q|	   j  ||	d	 }td
t|| }td
t|| } fddt	|D } fddt	|D }	t	|D ]}
|   qt	|D ]H}|d k	rD|D ]}d |_q6|
  ||   |   |	|   q$ j   jdd t||	D  jd}|d k	r؈ | j| jd }t|d
kr|d }|S t || S )N)r@   r;   rA   Zmedianr   g    Ar*   )dtypedeviceg    ATr'      r   c                    s   g | ]} j jd dqS r(   r)   r,   r.   r   r   r      s     zdo_bench.<locals>.<listcomp>c                    s   g | ]} j jd dqS r(   r)   r,   r.   r   r   r      s     c                 S   s   g | ]\}}| |qS r   r1   r3   r   r   r   r      s     )rO   )AssertionErrorr/   r*   r9   emptyr   int8r+   r:   r<   Zzero_r2   r;   r7   r=   r>   floatZquantiletolistlengetattrrB   )rC   ZwarmuprD   rE   Z	quantilesZ
fast_flushZreturn_modecacherF   rG   rJ   rH   Zn_warmuprI   r-   r   rK   r    r   r.   r   do_benchP   sL    





 
rZ    c                 C   sN  dd l }dd l}t| |js&|| } t||js<||}|d krHd}t|rZ|| jn|}|d krjd}t|r||| jn|}t| |jr| j|jkr|  } | 	 
   } t||jr|j|jkr| }|	 
   }| jdks|jdkr|jj| |||dd d S |j| |||dsJt| d|  d	| d
| d| d
d S )Nr   g{Gz?g        r   T)atolrtolZ	equal_nan)r\   r]    z is not close to z (atol=z, rtol=))numpyr/   
isinstanceZTensorr=   callablerO   bfloat16rU   cpudetachsizetestingZassert_allcloseZallcloserR   )r   yr\   r]   err_msgnpr/   r   r   r   assert_close   s2    

rk   c                   @   s   e Zd ZdZdddZdS )	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    r[   FNc                 C   sL   || _ || _|
| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _dS )a  
        Constructor

        :param x_names: Name of the arguments that should appear on the x axis of the plot. If the list contains more than one element, all the arguments are assumed to have the same value.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[str]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: List of arguments to remain fixed throughout the benchmark.
        :type args: List[str]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        N)x_namesx_valsx_logline_arg	line_vals
line_namesy_logstylesxlabelylabel	plot_nameargs)selfrm   rn   rp   rq   rr   rw   rx   ru   rv   ro   rs   colorrt   r   r   r   __init__   s    *zBenchmark.__init__)r[   r[   FFNN)__name__
__module____qualname____doc__r{   r   r   r   r   rl      s         rl   c                   @   s&   e Zd Zdd Zdd Zd
ddZd	S )Markc                 C   s   || _ || _d S r#   )rC   
benchmarks)ry   rC   r   r   r   r   r{     s    zMark.__init__c              
      s  dd l }dd lm} dd l}|j}dd |jD }	dd |jD }
|j|jd g| |	 |
 d}|jD ]  fdd|jD }g g g   }}}|jD ]p}| j	f ||j
|i|j}z|\}}	}
W n$ tk
r   |d d   }}	}
Y nX ||g7 }||	g7 }||
g7 }q g| | | |jt|< qh|jr|  | }|jd  t|jD ]\}}||d  ||d	   }	}
|jr|j| d nd }|jr|j| d
 nd }|j|  || |||d |	d k	rR|
d k	rR|j|  |	|
d|d qR|  |jr
|jn
d|j}|| ||j ||jr<dnd ||jrRdnd |rf|   |r|!|j"||j d ||jd g|j  }|rt#|jd  t#| |r|j$|j"||j dddd d S )Nr   c                 S   s   g | ]}| d qS )-minr   r   r   r   r   r     s     zMark._run.<locals>.<listcomp>c                 S   s   g | ]}| d qS )-maxr   r   r   r   r   r     s     )columnsc                    s   i | ]
}| qS r   r   )r   Zx_namer   r   r   
<dictcomp>  s      zMark._run.<locals>.<dictcomp>r   r   r   )labelrz   Zlsg333333?)alpharz   z = logZlinearz.png:z.csvz%.1fF)Zfloat_formatindex)%osZmatplotlib.pyplotZpyplotZpandasrr   Z	DataFramerm   rn   rq   rC   rp   rx   	TypeErrorlocrW   rw   ZfigureZsubplot	enumeratert   ZplotZfill_betweenZlegendru   r   Z
set_xlabelZ
set_ylabelrv   Z
set_xscalero   Z
set_yscalers   showZsavefigpathprintZto_csv)ry   bench	save_path
show_plots
print_datar   ZpltpdZy_meanZy_minZy_maxZdfZx_argsZrow_meanZrow_minZrow_maxrh   r    Zaxr-   colZstyru   r   r   r   _run  s^     





z	Mark._runFr[   c                 C   s   t | jt}|r| jgn| j}|r@ttj|dd}|d |D ],}| |||| |rD|d|j	 d qD|r|d d S )Nzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
)
ra   r   rl   openr   r   r   writer   rw   )ry   r   r   r   Zhas_single_benchr   htmlr   r   r   r   run:  s    
zMark.runN)FFr[   )r|   r}   r~   r{   r   r   r   r   r   r   r     s   3r   c                    s    fdd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                    s
   t |  S r#   )r   )rC   r   r   r   r%   O  r&   zperf_report.<locals>.<lambda>r   )r   wrapperr   r   r   perf_reportH  s    r   c                 C   sf   ddl }ddlm} | s tjj} |s.|j }|j|d }|j|d }|| d d d	 }|S )
z return DRAM bandwidth in GB/s r   Nr   driverZmem_clock_rateZmem_bus_width   g    .A   )	r/   r   r   backendCUDAr*   current_deviceutilsget_device_properties)r   rP   r/   r   Zmem_clock_khzZ	bus_widthZbw_gbpsr   r   r   get_dram_gbpsS  s    
r   c           
      C   s   dd l }ddlm} |s tjj}|s.|j }|j|d d }|sV|j|d }|j	|}|d dk r| |j
ks|td}n>| |jkrd}n.| |j
|jfkrd	}n| |jkrd
}ntd|| | d }	|	S )Nr   r   r   multiprocessor_count   sm_clock_rater      i   i   dtype not supported&.>)r/   r   r   r   r   r*   r   r   r   get_device_capabilityfloat16rR   float32rc   rT   r6   )
rO   r   rP   
clock_rater/   r   num_subcores
capabilityops_per_sub_coretflopsr   r   r   get_max_tensorcore_tflopsb  s,    


r   c                     s    fdd}|S )Nc                    s   t   fdd}|S )Nc            
         s   dd l }|t  }  | k}|r|dkrtjjd }tj	d dd}d|ksht
d|d jjj}| d	j d
| d}tjddd|gd|d}	|	jdkst
ddt|	jkst
n
| | d S )Nr   zcuda-memcheck__file__PATH1)r   ZPYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]Zpytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environrR   nodeZcallspecidr|   r   r   
returncodestrr   )
rx   kwargsr   Z	ppid_nameZrun_cuda_memcheckr   r   Ztest_idr   r   )target_kwargstest_fnr   r   r     s    z1cuda_memcheck.<locals>.decorator.<locals>.wrapper)	functoolswraps)r   r   r   )r   r   	decorator  s    z cuda_memcheck.<locals>.decoratorr   )r   r   r   r   r   cuda_memcheck  s    r   c                 C   sL   d | } dddd|  dg}t|}|tjjd}dd |D }|S )	Nr   r   r   r   r	   r
   c                 S   s   g | ]}t |qS r   r   r   r   r   r   r     s     znvsmi_attr.<locals>.<listcomp>r   r   r   r   r   
nvsmi_attr  s    

r   F    c                 c   s  zt dddddg t dddd|  d	|  g t dddd
| d	| g tdgd }tdgd }t||  dk std|  dt|| dk std| dd|  }d| d }||fV  W 5 t dddddg t ddddg t ddddg X d S )Nr   r   r   z-pmz-rgcz-rmcr   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memory
   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   absrR   )Zref_sm_clockZref_mem_clockZcur_sm_clockZcur_mem_clockr   Zgbpsr   r   r   set_gpu_clock  s4      r   c           
      C   s   dd l }ddlm} |s tjj}|s.|j }|j|d d }|j|d }|j	 }|d dk r| |j
krxd}q| |jkrd	}qtd
n.| |j
krd}n| |j|jfkrd	}ntd
|| | d }	|	S )Nr   r   r   r   r   r   r       @   r   r   )r/   r   r   r   r   r*   r   r   r   r   r   r   r6   rc   )
rO   r   rP   r/   r   r   r   r   r   r   r   r   r   get_max_simd_tflops  s,    





r   )r"   N)rM   rN   NNTrA   )NNr[   )NN)NNN)r   r   )NN)r   r   r   r   
contextlibr   Z_C.libtriton.tritonr   r!   rL   rZ   rk   rl   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s.   	
=    
O
%>F

  