U
    9%e?                  
   @  s*  d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
 ddlmZ ddlmZ e
d	ZG d
d deZddddddZddddddZddddddZdddddddZdddddddZddddd"d#d$d%Zddddd&d'd(Zddddd&d)d*Zddddd&d+d,Zddddd&d-d.Zddddd&d/d0Zdddddd1d2d3Zddddd&d4d5Zdddd"d&d6d7Z ddddd&d8d9Z!ddddd&d:d;Z"ddddd&d<d=Z#ddddd&d>d?Z$ddddd&d@dAZ%dddBdCdDZ&ddddd&dEdFZ'ddddd&dGdHZ(ddddd&dIdJZ)dddKdLdMZ*ddddNdOdPZ+ddddNdQdRZ,ddSdTdUdVZ-ddddd&dWdXZ.ddddd&dYdZZ/ddddd&d[d\Z0ddddd&d]d^Z1ddddd&d_d`Z2ddddd&dadbZ3dddddcdddeZ4dfddddgdhdiZ5ddfdddjdkdlZ6ddfdddjdmdnZ7dddddodpdqZ8ddddddrdsdtZ9ddddNdudvZ:ddfdddwdxdyZ;ddddd#dzd{Z<ddddd|d}d~Z=dd Z>ddddd|ddZ?dd Z@dd ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd ZGdddddddddd	ddZHdd ZIdd ZJddddddddddZKdddddddddZLdddddddddZMdddddddddZNdddddddddZOdddddddddZPdddddddddZQdddddddddZRdddddddddZSdddddddddZTdddddddddZUddddddddZVdddddddZWdddddddZXdd	dÜddńZYdddddƜddȄZZeYddgdˍdddd̜dd΄Z[eYddgdˍdddd̜ddЄZ\eYddgdˍdddd̜dd҄Z]eYddgdˍdddd̜ddԄZ^eYddgdˍdddd̜ddքZ_eYddgdˍdddd̜dd؄Z`dddd̜ddڄZaddfddۜdd݄Zbddfddۜdd߄ZcddfddۜddZddddddZedddddddZfdddddddddZgdd ZhdddZiddddddZjddddddZkdS )    )annotationsNwraps)ListOptionalSequenceTupleTypeVar   )ir   )coreTc                      s   e Zd Z fddZ  ZS )IncompatibleTypeErrorImplc                   s@   || _ || _d| j   d | j  | _tt| | j d S )Nzinvalid operands of type  and )type_atype_b__repr__messagesuperr   __init__)selfr   r   	__class__ W/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/triton/language/semantic.pyr      s    z"IncompatibleTypeErrorImpl.__init__)__name__
__module____qualname__r   __classcell__r   r   r   r   r      s   r   intz
ir.builderz	tl.tensor)axisbuilderreturnc                 C  s   t || t jS N)tltensorZcreate_get_program_idint32r!   r"   r   r   r   
program_id   s    r)   c                 C  s   t || t jS r$   )r%   r&   Zcreate_get_num_programsr'   r(   r   r   r   num_programs   s    r*   ztl.dtype)a_tyb_tyr#   c                 C  sx   | j }|j }| j}|j}||kr0||kr,| S |S |tjjjkrN||krJ| S |S |tjjjkrl||krh|S | S dsttd S NF)int_bitwidthint_signednessr%   dtypeZ
SIGNEDNESSZUNSIGNEDAssertionError)r+   r,   Za_rankZb_rankZa_snZb_snr   r   r   integer_promote_impl'   s    r2   bool)r+   r,   
div_or_modr#   c                 C  s   |   s|  rtjS |  s&| r,tjS |  s<| rL|rFtjS tjS |  s\| r|rftjS |  r|| r|tjS tjS | 	 r|	 sdst
|r| j|jkrtd|   d |  d t| |S )NFzCannot use /, #, or % with r   x because they have different signedness;this is unlikely to result in a useful answer. Cast them to the same signedness.)Zis_fp64r%   Zfloat64is_fp32float32is_fp16float16is_bf16bfloat16is_intr1   r/   
ValueErrorr   r2   )r+   r,   r4   r   r   r   computation_type_impl7   s&     r>   None)r   r   allow_ptr_ar#   c                 C  sF   |   rB|st| ||  r0| |kr0t| || rBt| |d S r$   )is_ptrr   is_floating)r   r   r@   r   r   r   check_ptr_type_impl]   s    

rC   FTzTuple[tl.tensor, tl.tensor])lhsrhsr"   r#   c           
      C  sx   t | ||\} }| jj}|jj}t||| t||| |rp| sp| spt|||}	t| |	|} t||	|}| |fS r$   )broadcast_impl_valuetypescalarrC   rA   r>   cast)
rD   rE   r"   Zallow_lhs_ptrZallow_rhs_ptrZarithmetic_checkr4   Z
lhs_sca_tyZ
rhs_sca_ty
ret_sca_tyr   r   r   binary_op_type_checking_impli   s    rK   )inputotherr"   r#   c                 C  s   t | ||dd\} }| jj}|jj}| r>| s>||  } }| r`t|| j|j| jS | rt|	| j|j| jS |
 rt|| j|j| jS dstd S NTF)rK   rG   rH   rA   r%   r&   create_addptrhandlerB   Zcreate_faddr<   Z
create_addr1   rL   rM   r"   input_scalar_tyother_scalar_tyr   r   r   add}   s    
rT   c                 C  s   t | ||dd\} }| jj}| rDt|| jt||j| jS |	 rft|
| j|j| jS | rt|| j|j| jS dstd S rN   )rK   rG   rH   rA   r%   r&   rO   rP   minusrB   Zcreate_fsubr<   Z
create_subr1   rL   rM   r"   	scalar_tyr   r   r   sub   s    rX   c                 C  sh   t | ||\} }| jj}| r:t|| j|j| jS | r\t|	| j|j| jS dsdt
d S r-   )rK   rG   rH   rB   r%   r&   Zcreate_fmulrP   r<   Z
create_mulr1   rV   r   r   r   mul   s    rY   c                 C  s   t | ||dddd\} }| jj}|jj}| rF| rFt|||}n| rd| rdt| ||} nn| r| rt| tj|} t|tj|}n@| r| r|j|jkrt|||}qt| ||} ndst	t
|| j|j| jS NFT)rK   rG   rH   rB   r<   rI   r%   r7   Zfp_mantissa_widthr1   r&   create_fdivrP   rQ   r   r   r   truediv   s     r\   c                 C  s   t | ||dddd\} }| jj}|jj}| r| rt||}t| ||} t|||}| r|t|	| j
|j
| jS t|| j
|j
| jS dstd S rZ   )rK   rG   rH   r<   r2   rI   is_int_signedr%   r&   Zcreate_sdivrP   Zcreate_udivr1   )rL   rM   r"   rR   rS   ret_tyr   r   r   floordiv   s    
r_   )rL   rM   ieee_roundingr"   r#   c                 C  s^   | j j}|j j}| r | s(tdt| ||dddd\} }|| j|j}t|| j S )Nz4both operands of fdiv must have floating scalar typeFT)	rG   rH   rB   r=   rK   r[   rP   r%   r&   )rL   rM   r`   r"   rR   rS   retr   r   r   fdiv   s    rb   c              	   C  s   t | ||dddd\} }| jj}|jj}| rXt| ttt| |d|||||}|S | r|j	|j	krt
d|  d |  d | rt|| j|j| jS t|| j|j| jS dstd S )NFTzCannot mod z by r5   )rK   rG   rH   rB   rX   rY   floorrb   r<   r/   r=   r   r]   r%   r&   Zcreate_sremrP   Zcreate_uremr1   )rL   rM   r"   rW   rS   ra   r   r   r   mod   s$      rd   c                 C  sz   t | ||ddd\} }| jj}|jj}| r6| s@t||t||}||kr^t| ||} ||krrt|||}| |fS r-   )rK   rG   rH   r<   r   r2   rI   )rL   rM   r"   input_sca_tyZother_sca_tyrJ   r   r   r   bitwise_op_type_checking_impl  s    

rf   c                 C  s*   t | ||\} }t|| j|j| jS r$   )rf   r%   r&   Z
create_andrP   rG   rL   rM   r"   r   r   r   and_  s    rh   c                 C  s*   t | ||\} }t|| j|j| jS r$   )rf   r%   r&   Z	create_orrP   rG   rg   r   r   r   or_"  s    ri   c                 C  s*   t | ||\} }t|| j|j| jS r$   )rf   r%   r&   Z
create_xorrP   rG   rg   r   r   r   xor_)  s    rj   c                 C  sD   | j  st| td|} |j  s8t|td|}t| ||S Nint1)rG   is_int1bitcastr%   r0   rh   rg   r   r   r   logical_and0  s
    

ro   c                 C  sD   | j  st| td|} |j  s8t|td|}t| ||S rk   )rG   rm   rn   r%   r0   ri   rg   r   r   r   
logical_or8  s
    

rp   rL   r"   c                 C  s&   | j  st| td|} t| |S rk   )rG   rm   rn   r%   r0   invertrq   r   r   r   not_@  s    
rs   c                 C  s*   t | ||\} }t|| j|j| jS r$   )rf   r%   r&   Zcreate_lshrrP   rG   rg   r   r   r   lshrF  s    rt   c                 C  s*   t | ||\} }t|| j|j| jS r$   )rf   r%   r&   Zcreate_ashrrP   rG   rg   r   r   r   ashrM  s    ru   c                 C  s*   t | ||\} }t|| j|j| jS r$   )rf   r%   r&   Z
create_shlrP   rG   rg   r   r   r   shlT  s    rv   )rL   r#   c                 C  s   | S r$   r   )rL   r   r   r   plus_  s    rw   )rL   r"   r#   c                 C  sH   | j j}| r$td|  d t||||}t	|| |S )Nz$wrong type argument to unary minus ())
rG   rH   rA   r=   r   r%   r&   get_null_valueto_irrX   )rL   r"   re   _0r   r   r   rU   c  s
    rU   c                 C  sP   | j j}| s| r,td|  d t||	||}t
| ||S )Nz%wrong type argument to unary invert (rx   )rG   rH   rA   rB   r=   r   r%   r&   Zget_all_ones_valuerz   rj   )rL   r"   re   Z_1r   r   r   rr   l  s
    rr   ztl.block_type)vr#   c                 C  s&   | j  stjS | j j}ttj|S r$   )rG   is_blockr%   rl   shape
block_type)r|   r~   r   r   r   
_bool_likex  s    
r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dstd S r-   )rK   rG   rH   rB   r%   r&   Zcreate_fcmpOGTrP   r   r<   r]   Zcreate_icmpSGTZcreate_icmpUGTr1   rV   r   r   r   greater_than  s    r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dstd S r-   )rK   rG   rH   rB   r%   r&   Zcreate_fcmpOGErP   r   r<   r]   Zcreate_icmpSGEZcreate_icmpUGEr1   rV   r   r   r   greater_equal  s    r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dstd S r-   )rK   rG   rH   rB   r%   r&   Zcreate_fcmpOLTrP   r   r<   r]   Zcreate_icmpSLTZcreate_icmpULTr1   rV   r   r   r   	less_than  s    r   c                 C  s   t | ||\} }| jj}| r<t|| j|jt| S |	 r|
 rht|| j|jt| S t|| j|jt| S dstd S r-   )rK   rG   rH   rB   r%   r&   Zcreate_fcmpOLErP   r   r<   r]   Zcreate_icmpSLEZcreate_icmpULEr1   rV   r   r   r   
less_equal  s    r   c                 C  sl   t | ||\} }| jj}| r<t|| j|jt| S |	 r`t|
| j|jt| S dshtd S r-   )rK   rG   rH   rB   r%   r&   Zcreate_fcmpOEQrP   r   r<   Zcreate_icmpEQr1   rV   r   r   r   equal  s    r   c                 C  sl   t | ||\} }| jj}| r<t|| j|jt| S |	 r`t|
| j|jt| S dshtd S r-   )rK   rG   rH   rB   r%   r&   Zcreate_fcmpUNErP   r   r<   Zcreate_icmpNEr1   rV   r   r   r   	not_equal  s    r   )startendr"   r#   c                 C  s   t | trt |tstdt| d? }t|d? }|s<|rDtd|| krTtd||  g}ttj|}t|| ||S )Nz/arange's arguments must be of type tl.constexpr    zarange must fit in int32z=arange's end argument must be greater than the start argument)	
isinstancer    r=   r3   r%   r   r'   r&   Zcreate_make_range)r   r   r"   Zis_start_int64Zis_end_int64r~   r^   r   r   r   arange  s    
r   z	List[int])r~   r0   r"   r#   c                 C  s   t |tjrP|jjdks tdt|||}t|j| }t|	|j
| |S |dkrj|||}nt|d|j }||}|d krtdt|| }t|	|| |S d S )Nr   zonly accepts size-1 tensorr   get_z2dtype must be specified when value is not a tensor)r   r%   r&   numelvaluer1   rI   r   r0   create_splatrP   ry   rz   getattrnamer=   )r~   r   r0   r"   r^   Zget_value_fnr   r   r   full  s    r   )rL   	dst_shaper"   r#   c                 C  sP   d}|D ]}||9 }q| j j|kr*tdt| j j|}t|| j||S )Nr   z$cannot view block of different shape)	rG   r   r=   r%   r   rH   r&   Zcreate_viewrP   )rL   r   r"   r   sr^   r   r   r   view
  s    
r   c                 C  s   t dd S )Nz`reshape` is not supported yet. Please use `view` instead if applicable. Note that view may reorder elements in an implementation- and context- dependent way.)r=   )rL   r   r"   r   r   r   reshape  s    r   )rL   r!   r"   r#   c                 C  s>   t | jj}||d t| jj|}t|| j	||S )Nr   )
listrG   r~   insertr%   r   rH   r&   create_expand_dimsrP   )rL   r!   r"   r   r^   r   r   r   expand_dims   s    r   )rD   rE   can_reorderr"   r#   c                 C  sX   |st dt| jdkst t| jj| jd |jd  g}t|| j	|j	|S )Nz;current implementation of `cat` always may reorder elementsr   r   )
r1   lenr~   r%   r   rG   rH   r&   Z
create_catrP   )rD   rE   r   r"   ret_typer   r   r   cat'  s    "r   c                 C  sJ   t | jdkrtdt| jj| jd | jd g}t|| j	|S )Nr
   z!Only 2D tensors can be transposedr   r   )
r   r~   r=   r%   r   rG   rH   r&   Zcreate_transrP   )rL   r"   r   r   r   r   trans.  s     r   )rL   r~   r"   r#   c                 C  s   | j  s.t| j |}t|| j||S | j  }t|t|kr\t	d| d| ||krh| S t
|D ]F\}}|| |krp|dkrpt	d||  d| d| d| d| 
qpt| j j|}t|| j||S )Nz!Cannot broadcast, rank mismatch: z, r   z3Cannot broadcast, the expanded size of the tensor (z ) must match the existing size (z) at non-singleton dimension : )rG   r}   r%   r   r&   r   rP   get_block_shapesr   r=   	enumeraterH   create_broadcast)rL   r~   r"   r^   Z	src_shapeiitemr   r   r   broadcast_impl_shape5  s    

,r   c              	   C  sp  | j }|j }| rJ| sJt|j|j}t||j|	 |}n| s| rt|j|j}t|| j|	 |} n| rh| rh|	 }|	 }t
|t
|k rtt
|t
|D ]8}t|| jdt|jdg| } | j }|	 }qn`t
|t
|k rltt
|t
|D ]:}t||jdt|jdg| }|j }|	 }q0t
|t
|kstg }t|D ]|\}	}
||	 }|
dkr|| nT|dkr||
 n>|
|kr||
 n(tdt|	 d t|
 d t| q||kr:t|j|}t|| j||} ||krht|j|}t||j||}| |fS )Nr   r   z?Cannot make_shape_compatible: incompatible dimensions at index r   r   )rG   r}   r%   r   rH   r~   r&   r   rP   r   r   ranger   r1   r   appendr=   strr   )rD   rE   r"   Zlhs_tyZrhs_tyZ	lhs_shapeZ	rhs_shapedim	ret_shaper   leftrightr^   r   r   r   rF   I  sb    &&





rF   )rL   dst_tyr"   r#   c                 C  s   | j }| r"t|j| j  }||kr.| S |j}|j}| sJ| rVt| ||S |j}|j}||krt	dt
| d t
| t|| j|||S )Nz!Cannot bitcast data-type of size z to data-type of size )rG   r}   r%   r   rH   r   rA   rI   primitive_bitwidthr=   r   r&   create_bitcastrP   rz   )rL   r   r"   src_ty
src_sca_ty
dst_sca_tyZsrc_bitsZdst_bitsr   r   r   rn     s$    rn   c                 C  s
   t | tS r$   r   r    )archr   r   r   _is_cuda  s    r   c                 C  s  | j }t|tjr|j}| r4t|j| j  }||kr@| S |j}|j}t	|j
r||j
dk r|| sp| r|tdt | r| s| r| rt|| j|||S | r| r| r| stt| tj|||S | o| o|j|jk}|r0t|| j|||S | oN| oN|j|jk }|rrt|| j|||S | r
| r
|j|jks|j|jkr
|  o|!  }|! r| j"|}	t|#|	| j"}
t$| |
|S t|%| j||||S |& r| r|! rT| j"|}	t|#|	| j"}
t$| |
|S |  rzt|'| j|||S t|(| j|||S | r|& r|! s|  st|)| j|||S t|*| j|||S |+ rf| rf|j}|dkr6t|,| j|||S |dkrft$t| tj-|t|.dtj-|S | r|+ rt|/| j|||S |+ r|+ rt|0| j|||S dst1d|  d| d S )	NY   zUStandard tl.float8e4 format will be deprecated on SM < 89. Please use tl.float8e4b15.@   r   r   Fzcannot cast z to )2rG   r   r%   	constexprr   r}   r   rH   r   r   r   Zis_fp8e4warningswarnDeprecationWarningZis_fp8rB   r&   Zcreate_fp_to_fprP   rz   r8   r6   r:   rI   r7   r   Zcreate_fp_truncZcreate_fp_extr<   r.   r/   r]   is_boolr0   ry   r   create_int_castZis_standard_floatingZcreate_fp_to_siZcreate_fp_to_uiZcreate_ui_to_fpZcreate_si_to_fprA   Zcreate_ptr_to_intint64	get_int64Zcreate_int_to_ptrr   r1   )rL   r   r"   r   r   r   Ztruncate_fpZext_fpZsign_extendtyr{   Zbitwidthr   r   r   rI     s    






 



rI   c                 C  sD   t jj}| r@| dkrt jj}n"| dkr0t jj}ntd|  d|S )Nz.ca.cgCache modifier  not supported)r   CACHE_MODIFIERNONECACGr=   cache_modifiercacher   r   r   _str_to_load_cache_modifier  s    

r   c                 C  sh   t jj}| rd| dkrt jj}nF| dkr0t jj}n4| dkrBt jj}n"| dkrTt jj}ntd|  d|S )Nz.wbr   z.csz.wtr   r   )r   r   r   ZWBr   CSZWTr=   r   r   r   r   _str_to_store_cache_modifier!  s    



r   c                 C  sD   t jj}| r@| dkrt jj}n"| dkr0t jj}ntd|  d|S )NZ
evict_lastZevict_firstzEviction policy r   )r   ZEVICTION_POLICYZNORMALZ
EVICT_LASTZEVICT_FIRSTr=   )eviction_policyevictionr   r   r   _str_to_eviction_policy1  s    

r   c                 C  s@   d }| r<| dkrt jj}n"| dkr,t jj}ntd|  d|S )NzeronanzPadding option r   )r   PADDING_OPTIONZPAD_ZEROPAD_NANr=   )padding_optionpaddingr   r   r   _str_to_padding_option=  s    

r   c                 C  sh   t jj}| rd| dkrt jj}nF| dkr0t jj}n4| dkrBt jj}n"| dkrTt jj}ntd|  d|S )NacquirereleaseZacq_relrelaxedzMemory semantic r   )r   ZMEM_SEMANTICZACQUIRE_RELEASEZACQUIREZRELEASEZRELAXEDr=   )Z
sem_optionsemr   r   r   _str_to_semI  s    



r   c                 C  s   | rt | ds| g} dd | D } | D ],}t|trNd|  krLt|k s&n tq&t| dksdtt| tt| kstdt| S t S )N__iter__c                 S  s"   g | ]}t |tjr|jn|qS r   r   r%   r   r   .0elemr   r   r   
<listcomp>]  s     z0_canonicalize_boundary_check.<locals>.<listcomp>r   z'Duplicate dimension in `boundary_check`)hasattrr   r    r   r1   setsortedtuple)boundary_checkblock_shaper   r   r   r   _canonicalize_boundary_checkY  s    
*r   c	              
   C  s|   |s|rt d| jjj}	|	tjks,td|	 rH|tjj	krHt d| jj}
t
||
 }t|| j||||||
S )NK`mask` and `other` arguments cannot be specified for loading block pointers3`tl.int1` should be rewrited in `tl.make_block_ptr`z@Padding option `nan` is not supported for integer block pointers)r=   rG   
element_tyr%   rl   r1   r<   r   r   r   r   r   r&   Zcreate_tensor_pointer_loadrP   )ptrmaskrM   r   r   r   r   is_volatiler"   elt_tyr   r   r   r   _load_block_pointerf  s    
r   c	              
   C  sn  | j j s"td| j   d|s2|r2td|s:|rBtd| j  sx|rb|j  rbtd|rx|j  rxtd| j  r|rt|| j  |}|rt|| j  |}| j j}	|	j}
|
t	j
krt	j}
t	|
|	j}	t| |	|} |rt||
|}| j  r| j  }t	|
|}n|
}|s>t	|| j||||S t	|| j|j|rZ|jnd ||||S d S )NUnsupported ptr type z in `tl.load`z)`other` cannot be provided without `mask`z`padding_option` or `boundary_check` argument is not supported for loading a tensor ofpointers or loading a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadEMask argument cannot be block type if pointer argument is not a blockzFOther argument cannot be block type if pointer argument is not a block)rG   rH   rA   r=   r   r}   r   r   r   r%   rl   int8pointer_typeaddress_spacerI   r   r&   Zcreate_loadrP   Zcreate_masked_load)r   r   rM   r   r   r   r   r   r"   ptr_tyr   r~   r   r   r   r   _load_legacy|  sH    



  r   zOptional[tl.tensor]r   )	r   r   rM   r   r   r   r   r"   r#   c	              
   C  sb   t |}	t|}
t|}| j rF| jj rFt| |||||	|
||	S t| |||||	|
||	S d S r$   )	r   r   r   rG   rA   r   r}   r   r   )r   r   rM   r   r   r   r   r   r"   r   r   r   r   r   r   load  s    
r   c           	   	   C  s   |rt d| jj }|j s.t|||}|j s@td||j ksVtd| jjj|jjksptd| jjj}|tjkstdt	||}t
|| j|j|||tjS )Nr   z-Value argument must be block type or a scalarz$Block shape and value shape mismatchz2Block element type and value element type mismatchr   )r=   rG   r   r   r}   r   r1   r%   rl   r   r&   Zcreate_tensor_pointer_storerP   void)	r   valr   r   r   r   r"   r   r   r   r   r   _store_block_pointer  s    


r   c           	   	   C  s0  | j j s"td| j   d|r.td| j  s`|j  rJtd|r`|j  r`td| j  rt|| j  |}|rt|| j  |}| j j}|j}|t	j
krt	j}t	||j}t| ||} t|||}|st	|| j|j||t	jS |j j stdt	|| j|j|j||t	jS )Nr   z in `tl.store`z`boundary_check` argument is not supported for storing a tensor of pointers or storing a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadzFValue argument cannot be block type if pointer argument is not a blockr   z"Mask must have boolean scalar type)rG   rH   rA   r=   r   r}   r   r   r   r%   rl   r   r   r   rI   r&   Zcreate_storerP   r   r   Zcreate_masked_store)	r   r   r   r   r   r   r"   r   r   r   r   r   _store_legacy  s2    



r   )r   r   r   r   r   r"   r#   c           	      C  sR   t |}t|}| j r:| jj r:t| ||||||S t| ||||||S d S r$   )r   r   rG   rA   r   r}   r   r   )	r   r   r   r   r   r   r"   r   r   r   r   r   store  s
    r   )r   cmpr   r   r"   r#   c                 C  sD   t |}| jjj}|jdkr$tdt|| j	|j	|j	||jS )N)   r   r   z9atomic_cas only supports elements with width {16, 32, 64})
r   rG   rH   r   r   r=   r%   r&   Zcreate_atomic_casrP   )r   r   r   r   r"   r   r   r   r   
atomic_cas(  s
    

r  z&Tuple[tl.tensor, tl.tensor, tl.tensor])r   r   r   opr"   r#   c                 C  s   | j j std| j   | j jj}|tjkrJ|dkrJtd| d |tjtj	tj
tjfkrztd| d t| | j  r|rt|| j  |}|rt|| j  |}t|| j jj|}|s|d}tj}| j  r
||| j  }ttj| j  }t||}| ||fS )Nz)Pointer argument of store instruction is rT   Zatomic_z does not support fp16z does not support T)rG   rH   rA   r=   r   r   r%   r9   rl   r   Zint16r;   r   r}   r   r   rI   Zget_int1r   r   r&   )r   r   r   r  r"   r   Zmask_irZmask_tyr   r   r   atom_red_typechecking_impl4  s*    


r  )r   r   r   r   r"   r#   c                 C  sD  t | ||d|\} }}t|}|jj}| r| r\t|t	j
j| j|j|j||jS t|t	j
j| j|j|j||jS t|tj|}t| ttjd|}t|t|d||}t|t|d||}	t|t	j
j|j|jt|||j||j}
t|t	j
j|j|jt||	|j||j}t||
||S )Nmaxr   r   )r  r   rG   rH   r<   r]   r%   r&   create_atomic_rmwr   	ATOMIC_OPMAXrP   UMAXrn   r'   r   r   get_fp32r   rh   UMINwherer   r   r   r   r"   sca_tyZi_valZi_ptrposnegZpos_retZneg_retr   r   r   
atomic_maxP  s8    	..r  c                 C  sD  t | ||d|\} }}t|}|jj}| r| r\t|t	j
j| j|j|j||jS t|t	j
j| j|j|j||jS t|tj|}t| ttjd|}t|t|d||}t|t|d||}	t|t	j
j|j|jt|||j||j}
t|t	j
j|j|jt||	|j||j}t||
||S )Nminr   r   )r  r   rG   rH   r<   r]   r%   r&   r  r   r  ZMINrP   r  rn   r'   r   r   r
  r   rh   r	  r  r  r   r   r   
atomic_mint  sT    	r  c              	   C  s`   t | ||d|\} }}t|}|jj}| r6tjjntjj}t	
||| j|j|j||jS )NrT   )r  r   rG   rH   rB   r   r  ZFADDZADDr%   r&   r  rP   )r   r   r   r   r"   r  r  r   r   r   
atomic_add  s
    r  c              	   C  sD   t | ||d|\} }}t|}t|tjj| j|j|j||j	S )Nand)
r  r   r%   r&   r  r   r  ANDrP   rG   r   r   r   r   r"   r   r   r   
atomic_and  s    r  c              	   C  sD   t | ||d|\} }}t|}t|tjj| j|j|j||j	S )Nor)
r  r   r%   r&   r  r   r  ORrP   rG   r  r   r   r   	atomic_or  s    r  c              	   C  sD   t | ||d|\} }}t|}t|tjj| j|j|j||j	S )Nxor)
r  r   r%   r&   r  r   r  ZXORrP   rG   r  r   r   r   
atomic_xor  s    r  c              	   C  sD   t | ||d|\} }}t|}t|tjj| j|j|j||j	S )NZxchg)
r  r   r%   r&   r  r   r  ZXCHGrP   rG   r  r   r   r   atomic_xchg  s    r  )rD   rE   
allow_tf32	out_dtyper"   r#   c           
   
   C  s  | j  r|j  st| j|jks>td| j d|j dt| jdks^td| j dt|jdks~td|j d| jd j|jd	 jkstd| j d
|j d| jd j d|jd	 j d	| jd	 jdkr| jd jdkr|jd jdkstd| j d|j d| j j rj| j jt	j
ks>td| jd jdksXtd|d	}t	j}nP| j j s| j j r|d	}t	j}n"| r|d	n|d	}|}| j jd	 }|j jd }||||g}t	|||g}	t	|| j|j|||	S )NzFirst input (z) and second input (z) must have the same dtype!r
   zFirst input shape (z) is not two dimensional!zSecond input shape (r   r   z) and second input shape z= are not compatible for matmul (second index of first shape (z0) must be equal to first index of second shape (rx   r  z&All values in both first input shape (z) and second input shape (z) must be >= 16!zonly int8 supported!r   zsmall blocks not supported!)rG   r}   r1   r0   r   r~   r   rH   r<   r%   r   	get_int32r'   r6   r:   r
  r7   r8   Zget_fp16r   r   r&   Z
create_dotrP   )
rD   rE   r  r   r"   r{   Zret_scalar_tyMNr^   r   r   r   dot  s6    &  N 

r$  )	conditionxyr"   r#   c                 C  s   t | tj|} | j rHt| ||\} }t|||\}}t| ||\} }t|||dd\}}| j svt| ||\} }|j}t|| j	|j	|j	|S )NT)
rI   r%   rl   rG   r}   rF   rK   r&   Zcreate_selectrP   )r%  r&  r'  r"   _r^   r   r   r   r     s    

r  zSequence[tl.tensor]zTuple[tl.tensor, ...])inputsr!   r"   r#   c           	        s    d krNg }t tD ](}| jjg}|t| || qt|d d jj} fddt	|D D ]}|jj|kstt
qtfdd|dd D  |   tfddt tD S )	Nr   c                   s   g | ]\}}| kr|qS r   r   )r   r   r   )r!   r   r   r   !  s      zreduction.<locals>.<listcomp>c                   s"    rt | }n|}t | |S r$   r%   r   r&   r&  rW   Zres_ty)r   r   r   wrap_tensor%  s    zreduction.<locals>.wrap_tensorc                 S  s   g | ]
}|j qS r   rP   r   tr   r   r   r   -  s     c                 3  s&   | ]} | | jjV  qd S r$   Z
get_resultrG   rH   r   r   )r)  	reduce_opr,  r   r   	<genexpr>1  s   zreduction.<locals>.<genexpr>)r   r   r   r   r   r   r   rG   r~   r   r1   Zcreate_reduceverify)	r)  r!   region_builder_fnr"   Z
new_inputsr   Z	new_shaper~   r/  r   )r!   r)  r2  r   r,  r   	reduction  s$    
r6  c                   st   t  dkrtd d jjfdd|dd  D ||   t fdd	tt  D S )
Nr   z7Current implementation only support single tensor inputr   c                   s   t | }t | |S r$   r*  r+  )r~   r   r   r,  C  s    z%associative_scan.<locals>.wrap_tensorc                 S  s   g | ]
}|j qS r   r-  r.  r   r   r   r   G  s     z$associative_scan.<locals>.<listcomp>c                 3  s&   | ]} | | jjV  qd S r$   r0  r1  )r)  scan_opr,  r   r   r3  K  s   z#associative_scan.<locals>.<genexpr>)r   r=   rG   r~   Zcreate_scanr4  r   r   )r)  r!   r5  r"   r   )r)  r7  r~   r,  r   associative_scan<  s    
r8  z	List[str])dtypesr#   c                   s    fdd}|S )ai  
    We following libdevice's convention to check accepted data types for math functions.
    It is not a good practice to support all data types as accelerators/GPUs don't support
    many float16 and bfloat16 math operations.
    We should let the users know that they are using and invoke explicit cast to convert
    the data type to the supported one.
    c                   s   t   fdd}|S )Nc                    sZ   t | t |  }dd |D D ],}|jjj kr"td  d|jjj q"| |S )Nc                 S  s   g | ]}t |tjr|qS r   )r   r%   r&   )r   ar   r   r   r   b  s      z@_check_dtype.<locals>.wrapper.<locals>.check.<locals>.<listcomp>zExpected dtype z	 but got )r   valuesrG   rH   r   r=   )argskwargsZall_argsarg)r9  fnr   r   check^  s
    z,_check_dtype.<locals>.wrapper.<locals>.checkr   )r?  r@  r9  )r?  r   wrapper]  s    z_check_dtype.<locals>.wrapperr   )r9  rB  r   rA  r   _check_dtypeU  s    rC  )r&  r'  r"   r#   c                 C  s,   t | ||\} }ddlm} |j| ||dS Nr   )math)Z_builder)rK    rE  Zmulhi)r&  r'  r"   rE  r   r   r   umulhik  s    rG  Zfp32Zfp64rA  )r&  r"   r#   c                 C  s   ddl m} |j| |dS rD  )rF  rE  rc   )r&  r"   rE  r   r   r   rc   r  s    rc   c                 C  s   t || j| jS r$   )r%   r&   Z
create_exprP   rG   r&  r"   r   r   r   expy  s    rI  c                 C  s   t || j| jS r$   )r%   r&   Z
create_logrP   rG   rH  r   r   r   log~  s    rJ  c                 C  s   t || j| jS r$   )r%   r&   Z
create_cosrP   rG   rH  r   r   r   cos  s    rK  c                 C  s   t || j| jS r$   )r%   r&   Z
create_sinrP   rG   rH  r   r   r   sin  s    rL  c                 C  s   t || j| jS r$   )r%   r&   Zcreate_sqrtrP   rG   rH  r   r   r   sqrt  s    rM  c                 C  sd   | j }| r$t|| j| jS | rBt|| j| jS |	 rN| S ds`t
d| d S )NFzUnexpected dtype )r0   rB   r%   r&   Zcreate_fabsrP   rG   r]   Zcreate_iabsZis_int_unsignedr1   )r&  r"   r0   r   r   r   abs  s    rN  )r&  r;  r#   c                 C  s:   t | jt |krtd| jdt|| j  | S )NzAShape of input to multiple_of does not match the length of valuesztt.divisibilityr   r~   r=   rP   Zset_attrr   Z	make_attrZget_contextr&  r;  r   r   r   multiple_of  s    rQ  c                 C  s:   t | jt |krtd| jdt|| j  | S )NzDShape of input to max_contiguous does not match the length of valuesztt.contiguityrO  rP  r   r   r   max_contiguous  s    rR  c                 C  s:   t | jt |krtd| jdt|| j  | S )NzCShape of input to max_constancy does not match the length of valuesztt.constancyrO  rP  r   r   r   max_constancy  s    rS  )r"   r#   c                 C  s   t |  t jS r$   )r%   r&   Zcreate_barrierr   )r"   r   r   r   debug_barrier  s    rT  zList[tl.tensor])prefixr<  r"   r#   c                 C  s0   g }|D ]}| |j qt|| |tjS r$   )r   rP   r%   r&   Zcreate_printr   )rU  r<  r"   new_argsr>  r   r   r   device_print  s    rW  )condmsg	file_namelinenor"   r#   c              	   C  sP   | j }| s2t|jd}t|| jd|} t|| j||||tj	S )N)r   )
rG   r}   r%   r   rH   r&   r   rP   Zcreate_assertr   )rX  rY  rZ  	func_namer[  r"   Zcond_tyr   r   r   device_assert  s
    r]  c                 C  s   t |tjr(|r| |jS | |jS t |tjr|jjdksHtd|j	
 sZtd|j	tjkr|r| |j|  |j	 S |j	tjkr| |j|  |j	 S |jS dstdt| d S )Nr   z*Expected a scalar in shape/strides/offsetsz8Expected an integer scalar type in shape/strides/offsetsFz3Unsupported element type in shape/strides/offsets: )r   r%   r   r   r   r!  r&   r   r1   r0   r<   r   r   rP   Zget_int64_tyr]   r'   Zget_int32_tyrG   )r"   r   require_i64r   r   r   _convert_elem_to_ir_value  s    r_  c                   s,   t |dr fdd|D S t |gS )Nr   c                   s   g | ]}t  |qS r   )r_  r   r"   r^  r   r   r     s     z)_convert_to_ir_values.<locals>.<listcomp>)r   r_  )r"   	list_liker^  r   r`  r   _convert_to_ir_values  s    
rb  )baser"   r#   c              	     s:  t ||}t ||}t ||dd}| j r8| jj r@td| jjtjkrht| t	tj
| jj|} t dsx g dd  D  tdd  D stdt|ds|g}d	d |D }t|ttt|kstd
t fdd||||fD std|| j||| |}t|t	t| jj S )NFr^  zMExpected `base` to be a pointer type (but not a block pointer type or others)r   c                 S  s"   g | ]}t |tjr|jn|qS r   r   r   r   r   r   r     s     z"make_block_ptr.<locals>.<listcomp>c                 S  s.   g | ]&}t |to(d |  ko$dk n  qS )i   l        r   r   r   r   r   r     s     zGExpected a list of constant integers (`int32_t` range) in `block_shape`c                 S  s"   g | ]}t |tjr|jn|qS r   r   r   r   r   r   r     s     z<Expected a permutation of (0, 1, ..., len(order)-1) in orderc                   s   g | ]}t  t |kqS r   )r   )r   ra  r   r   r   r     s     zBExpected shape/strides/offsets/block_shape to have the same length)rb  rG   rA   r   r}   r=   r%   rl   rI   r   r   r   r   allr1   r   r   r   r   Zcreate_make_block_ptrrP   r&   r   )rc  r~   stridesoffsetsr   orderr"   rP   r   re  r   make_block_ptr  s,    



 "rj  c                 C  s&   t ||dd}t|| j|| jS )NFrd  )rb  r%   r&   Zcreate_advancerP   rG   )rc  rh  r"   r   r   r   advance  s    rk  )FFTF)T)l
__future__r   r   	functoolsr   typingr   r   r   r   r	   Z_C.libtriton.tritonr   rF  r   r%   r   	Exceptionr   r)   r*   r2   r>   rC   rK   rT   rX   rY   r\   r_   rb   rd   rf   rh   ri   rj   ro   rp   rs   rt   ru   rv   rw   rU   rr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rF   rn   r   rI   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r$  r  r6  r8  rC  rG  rc   rI  rJ  rK  rL  rM  rN  rQ  rR  rS  rT  rW  r]  r_  rb  rj  rk  r   r   r   r   <module>   s   &      	;v9,$.


&'
'