U
    9%eZ                     @   sp  d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e je je jgZdd Zd	d
 Zdd Zeeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddge  dddge
edddeddd ieejejejejejejejd d!d"ZG d#d$ d$e jjZejZdS )%    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 C   sR   | |kr| S | t kst|t ks$tt D ]$}| |kr<|  S ||kr(|   S q(d S N)_ordered_datatypesAssertionError)abd r   P/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/triton/ops/matmul.pyget_higher_dtype
   s    r   c                    s    fddS )Nc                    s   |     S r   )Zzero_)nargsnamer   r   <lambda>       zinit_to_zero.<locals>.<lambda>r   r   r   r   r   init_to_zero   s    r   c                  C   s   g } dD ]~}dD ]t}dD ]j}dD ]`}|dkr0dnd}|  t|||dd	||d
 dD ](}|  t||||d	||tdd qVq qqq| S )N)r               )       )r    @   )r    r!         r!   r   r   r	   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r      r   C)r*   r+   Zpre_hook)appendr   r   )configsr*   Zblock_mZblock_kZblock_nr+   Zsplit_kr   r   r   get_configs_io_bound   s&       r0   r"   r#   r    r$   r   r,   r)   r!   r   r   MNK
   )r
   Z
perf_modelZtop_k)r/   keyZprune_configs_byEVEN_Kc                 C   s   | d | d | d   dkS )Nr3   r'   r(   r   r   )argsr   r   r   r   L   r   r   )dot_out_dtyper%   r&   r'   GROUP_Mr(   r6   c           (      C   s  t d}t d}t ||}t ||}|| }|| }t|||  |}|| ||  }|| | }|| t d| }|| t d| }t t || ||}t t || ||}|| t d| } | |d d d f | | d d d f |   } || d d d f | |d d d f |	   }t j||f|d}!tdt ||| D ]}"|rvt 	| }#t 	|}$nb||"||   }%t jd|j
jd}&t j	| | d d d f |%k |&d}#t j	|| d d d f |%k |&d}$|#|j
j}#|$|j
j}$|!t j|#|$|d7 }!| || | 7 } ||| | 7 }qV|!|j
j}!|| t d| }|| t d| }||d d d f |
 |d d d f |   }||k d d d f ||k d d d f @ }'|dkrt j||!|'d nt j||!|'d d S )Nr   r	   )dtype)r	   r	   )maskother)Z	out_dtype)r;   )tlZ
program_idr   minZarangeZmax_contiguousZmultiple_ofZzerosrangeloadr:   Z
element_tytodotstoreZ
atomic_add)(ABr-   r1   r2   r3   Z	stride_amZ	stride_akZ	stride_bkZ	stride_bnZ	stride_cmZ	stride_cnr8   r%   r&   r'   r9   r(   r6   pidZpid_zZgrid_mZgrid_nwidthZgroup_idZ
group_sizeZpid_mZpid_nZrmZrnramZrbnZrkacckr   r   Zk_remainingZ_0r;   r   r   r   _kernel-   sL    +

,,
  ,(
rK   c                   @   s.   e Zd ZeZi Zedd ZedddZdS )_matmulc           	         s  | j }| ddkr*| ddkr*|  } |ddkrN|ddkrN| }| jd |jd ksjtd| j\ }|j\}| jtjtjtj	fks|jtjtjtj	fkrt
j}nt| j|j}t
j f||d}|d kr|t
jt
jt
jfkrtj}ntj}nJt|t
jstd|t
jkr,tj}n |t
jt
jfkrFtj}ntj} fdd}t| | || || d| d|d|d|d|d|dd	 |S )
Nr   r	   zincompatible dimensions)devicer:   z#dot_out_dtype must be a torch.dtypec                    s$   t  | d t | d  | d fS )Nr%   r&   r(   )r   )ZMETAr1   r2   r   r   r      r   z_matmul._call.<locals>.<lambda>r,   )r8   r9   )rM   Zstride
contiguousshaper   r:   r=   Zfloat8e4Zfloat8e4b15Zfloat8e5torchfloat16r   emptyfloat32bfloat16Zint32
isinstancerK   )	r   r   r8   rM   r3   _Zc_dtypecgridr   rN   r   _call   sH    


   z_matmul._callNc                 C   s   t j|||dS )N)r8   )rL   rZ   )ctxr   r   r8   r   r   r   forward   s    z_matmul.forward)N)	__name__
__module____qualname__rK   ZkernelZ_locksstaticmethodrZ   r\   r   r   r   r   rL      s   
)rL   )rQ    r   r   r   r   r   r   r=   Zmatmul_perf_modelr
   r   rR   rU   rT   r   r   r   r0   Z	constexprrK   ZautogradFunctionrL   applymatmulr   r   r   r   <module>   sd        84