U
    9%ew{                     @   s(  d dl Z d dlZd dlmZ dd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd ZdCddZdd  Zd!d" Zd#d$ Ze rd dlZd dlmZ d d%lmZmZ ejejejejejejejd&d'd(Zejejejejejejd)d*d+Zd,d- Z d.d/ Z!d0d0dd1dd2ej"ej"ej"eej" e#eeee$ ee$ ee$ f  d3d4d5Z%dd1dd6ej"ej"eej" e#eeee$ ee$ ee$ f  d7d8d9Z&ejejejd:d;d<Z'dDd=d>Z(dEej"ej"ej"eej" e)e#ee) d@dAdBZ*ndZ(dZ&dZ%dZ*dS )F    N)get_device_capabilityc                  C   sF   t j sdS zdd l} | d k	o(t dkW S  tk
r@   Y dS X d S )NFr   )   r   )torchcudaZis_availabletritonr   ImportError)r    r   W/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torch/sparse/_triton_ops.py_has_triton   s    
r
   c                 C   s   | st |d S N)
ValueError)Zcondmsgr   r   r	   check   s    r   c                 C   s   t |jtjk|  d d S )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layoutr   
sparse_bsr)f_nametr   r   r	   check_bsr_layout   s    
r   c                 C   s&   t |j|ko|jjdk|  d d S )Nr   z9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   r   r   r	   check_device   s    r   c                 C   s   t | dko| dk|  d|  d|  d |jdd  \}}|jdd  \}}t ||k|  d| d| d d S )N   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhsmklkrnr   r   r	   check_mm_compatible_shapes%   s    r"   c                 G   sF   t |j|ko(|jtjtjtjft|  k|  d| d|j d d S )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r   )r   dtyper   halfbfloat16floattuple)r   r   r#   Zadditional_dtypesr   r   r	   check_dtype6   s
    
r(   c                    sP   t |dkstdd   fdd}t|||  d|d  d|d	  d
 d S )Nr   c                 S   s   | | d @  S N   r   )vr   r   r	   is_power_of_twoC   s    z(check_blocksize.<locals>.is_power_of_twoc                    s&   d}| D ]}|dko |o|}q|S )NT   r   )bres	blocksizer,   r   r	   is_compatible_blocksizeF   s    z0check_blocksize.<locals>.is_compatible_blocksizez(): sparse inputs' blocksize (r   z, r*   z;) should be at least 16 and a power of 2 in each dimension.)lenAssertionErrorr   )r   r0   r2   r   r1   r	   check_blocksize@   s    r5   c                 C   s,   |  ddkr$|  ddkr$|  S | S d S )Nr   r*   )stride
contiguous)r   r   r   r	   make_triton_contiguousT   s    r9   c                 G   s@   zt jdd |D  W S  tk
r:   td|  d Y nX d S )Nc                 s   s   | ]}|j d d V  qd S Nr   r   .0r   r   r   r	   	<genexpr>]   s     z'broadcast_batch_dims.<locals>.<genexpr>Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorsr   r   r	   broadcast_batch_dims[   s    rB   c                 g   s2   |D ](}t d g|  }||| < || V  qd S r   )slicer   )r   Zslice_rangerA   r   slicesr   r   r	   slicerb   s    rE   c                 g   sN   |D ]D}t d g|  }t| |D ]\}}|d k	r$|||< q$|| V  qd S r   )rC   r   zip)ZdimsrD   rA   r   sdZd_slicer   r   r	   multidim_sliceri   s    
rI   c                  g   s"   | D ]}|V  |  E d H  qd S r   )r7   )rA   r   r   r   r	   ptr_stride_extractorr   s    rJ   c           	      #   s   dt    krdksn tdt   kr6dks<n tdd l} fdd}fdd}|j|  D ]H}dd t |D }d	d t||D }|d d d
 f||V  qjd S )Nr      c                  3   s&   t  D ]\} }td| |V  q
d S )Nr   )rF   range)fgmg)	full_gridgrid_blocksr   r	   generate_grid_points~   s    z.grid_partitioner.<locals>.generate_grid_pointsc                 3   s(      D ]\}}tt|| |V  qd S r   )itemsnextrI   )rD   r   Zt_dims)tensor_dims_mapr   r	   generate_sliced_tensors   s    z1grid_partitioner.<locals>.generate_sliced_tensorsc                 S   s    g | ]\}}}t || |qS r   )min)r=   rM   gprN   r   r   r	   
<listcomp>   s     z$grid_partitioner.<locals>.<listcomp>c                 S   s   g | ]\}}t ||| qS r   )rC   )r=   rW   gr   r   r	   rX      s     r6   )r3   r4   	itertoolsproductrF   )	rO   rP   rT   rZ   rQ   rU   Z
grid_pointgridrD   r   )rO   rP   rT   r	   grid_partitionerx   s    r]   c                    sh   dd d d }|d kr|}n$dd  t  fddt||D }t|||D ]^}}| |f|  qLd S )N)i  r^   r6   c                 S   s    | d kr|S t dt| |S d S r)   )maxrV   )rY   rN   r   r   r	   valid_grid_dim   s    z%launch_kernel.<locals>.valid_grid_dimc                 3   s   | ]\}} ||V  qd S r   r   )r=   rY   rN   r`   r   r	   r>      s    z launch_kernel.<locals>.<genexpr>)r'   rF   r]   )kernelrT   rO   rP   Zcuda_max_gridr\   sliced_tensorsr   ra   r	   launch_kernel   s    rd   c                    s   |   d}|  d}t|  d}dd |D }tj|jd d fdd |D  dd   |d	} |d	} ||jdd  } fd
d|D }|||f|S )Nr   c                 S   s   g | ]}t |d qS )r   )r9   	unsqueezer<   r   r   r	   rX      s     z"prepare_inputs.<locals>.<listcomp>c                 s   s   | ]}|j d d V  qd S r:   r;   r<   r   r   r	   r>      s     z!prepare_inputs.<locals>.<genexpr>c                 S   s   |  || dt|d S )Nr   r*   )broadcast_toflattenr3   )r   Z
batch_dimsZinvariant_dimsr   r   r	   batch_broadcast_and_squash   s     
z2prepare_inputs.<locals>.batch_broadcast_and_squashr6   c                    s"   g | ]} ||j d d qS )r   Nr;   r<   ri   Zbatch_dims_broadcastedr   r	   rX      s    )crow_indicesre   col_indicesr9   valuesr   r?   r   )bsrZdense_tensorsrl   rm   rn   rA   r   rk   r	   prepare_inputs   s2    $      rp   c                 G   s|   t | |f| }| |d }| |d }| || jdd   }||jdd   }tj|||||jdS )Nrj   rf   r   sizer   )	rB   rl   rg   rm   rn   r   r   sparse_compressed_tensorr   )r   ro   rA   Zbatch_shaperl   rm   rn   rr   r   r   r	   broadcast_batch_dims_bsr   s     rt   c                 C   sH   | j ^ }}}|||d  |d ||d  |d g }| |ddS )Nr   r*   rf   r   )r   reshape	transpose)r   r0   restr   r!   Z	new_shaper   r   r	   tile_to_blocksize   s    

rx   )OptionalTuple)IS_BETA_ZEROBLOCKSIZE_ROWBLOCKSIZE_COLTILE_K	acc_dtype
allow_tf32c            5   	   C   s2  t jdd} t jdd}!|||   ||!  }"t |"}#t |"| }$|$|# }%|%dkrXd S t d|}&t d|}'|||   |	|#  |
|&d d d f   ||'d d d f   }(|||   ||#  })|||   ||!  ||&d d d f   }*|||   ||'d d d f   }+t d|},t|%D ]}-t j||f|d}.t |)}/td||D ]}0|0|, }1|1|k }2t j|*||1d d d f   |2d d d f dd}3t j|+||/  ||1d d d f   |2d d d f dd}4|.t j|3|4|d7 }.qJ|r|.| 9 }.n| |. |t |(  }.t |(|.|j	j
 |(|	7 }(|)|7 })qd S )Nr*   Zaxisr   r#           maskotherr   )tl
program_idloadarangerL   zerosdotstoretor#   
element_ty)5alphabetar{   r|   r}   kr~   
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_strideZmat1_ptrZmat1_batch_strideZmat1_tiled_row_strideZmat1_tiled_col_strideZmat1_row_block_strideZmat1_col_block_strideZmat2_ptrZmat2_batch_strideZmat2_tiled_row_strideZmat2_tiled_col_strideZmat2_row_block_strideZmat2_col_block_strider   r   	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrZmat1_block_ptrsZmat2_block_ptrsZk_tile_arange_Z	acc_block	col_blockZk_tileZ	k_offsetsZmask_kZ
mat1_blockZ
mat2_blockr   r   r	   _sampled_addmm_kernel   s    #
		
  
r   )r|   r}   r   r   GROUP_SIZE_ROWc           0      C   s   t jdd}t jdd}t jdd}t jdd}t jdd} t |||| |\}}|||  |	|  }!t |!}"t |!|	 }#|#|" }$|$dkrd S t d| }%t d|}&|||  ||"  ||%d d d f   ||&d d d f   }'|||  ||  ||&d d d f   ||%d d d f   }(|||  ||  ||  ||%d d d f   ||%d d d f   })|
||  ||"  }*t j| | f|d}+t|$D ]P},t |'}-t |*}.t |(||.  }/|+t j|-|/|d7 }+|'|7 }'|*|7 }*qt 	|)|+
|jj d S )Nr   r   r   r*   r   r   )r   r   Znum_programsZ	swizzle2dr   r   r   rL   r   r   r   r#   r   )0r|   r}   r   r   r   r   r   r   r   r   r   r   r   Z	dense_ptrZdense_batch_strideZdense_tiled_row_strideZdense_tiled_col_strideZdense_row_block_strideZdense_col_block_strideZ
output_ptrZoutput_batch_strideZoutput_tiled_row_strideZoutput_tiled_col_strideZoutput_row_block_strideZoutput_col_block_strider   r   r   r   r   Zcol_block_pidn_block_rowsn_block_colsr   r   r   r   r   r   r   Zdense_block_ptrsZoutput_ptrsr   Zoutput_acc_blockr   Zvalues_blockZdense_row_idxZdense_blockr   r   r	   "_bsr_strided_dense_rowspace_kernel\  s    )    



r   c              
      s   | d}| dd }| d}	||	|f}
|d k	rht|d d d d d ddt|d d    }nd }|d|d|d|d	|d
i}|jtjtjfkrtj dn
tj	 d fdd}t
|||
| d S )Nr   r6   r*   rf   rK   r   r   NNr   Nr6   )r   rf   N)r   rf   TFc                    s&   t |  t|  dddd d S )N   r*   )r   r   r   
num_stages	num_warps)r   rJ   r\   rc   r   r   r0   r   r	   rb     s    z*_run_dense_rowspace_kernel.<locals>.kernelrr   r'   r3   r#   r   r$   r%   r   float32Zfloat64rd   )r0   rn   rl   rm   denseoutputmax_grid	n_batchesr   r   rO   rP   rT   rb   r   r   r	   _run_dense_rowspace_kernel  s2    


4     r   c              
      s   | d}| dd }||f}|d k	r\t|d d d d d ddt|d d    }nd }|d|d|d|	d|
di}|jtjtjfkrtj d	n
tj	 d
 fdd}t
|||| d S )Nr   r6   r*   r   r   )r   N)r   r6   )r   r   TFc                    s2   t |  fft|  ddd d S )Nr*   r   )r   r   r   r   )r   rJ   r   r   r   r   r   r0   is_beta_zeror   tile_kr   r	   rb     s         z)_run_sampled_addmm_kernel.<locals>.kernelr   )r   r   r   r0   r   r   rn   rl   rm   mat1mat2r   r   r   rO   rP   rT   rb   r   r   r	   _run_sampled_addmm_kernel   s0    
4     r   g      ?F)r   r   outskip_checksr   )inputr   r   r   r   r   c                C   sn  d}t ||  t|| ||}	|s&t||| j t||| j |dkrh| jtjkrhtd| d| d | jtjk	rt||| j t||| j nt|||j t	||| |d k	r&t || t|||j t||| j t|j
|	j
ko| |  k| d|	j
 d|	  d|j
 d	|  	 |d krB|	j|jd
d}n
||	 | dksh| dkrl|S | j
dd  }
|d}|d}|d}|dks|dkr| | |S |}t|||\}}}}}t||
d |f}t|||
d f}t|
 }t|||dk|
|||||||| |  dd  | dd  krj| || j
 |S )Nsampled_addmmr   Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r   r6   r*   rf   )r   rt   r   r   r#   r   boolr   r(   r"   r   _nnzr   Zcopy_numelrn   rr   mul_rp   rx   r_   r   r7   ru   )r   r   r   r   r   r   r   r   r   Zinput_broadcastedr0   r   r!   r   
out_backuprl   rm   rn   r   r   r   r	   r   +  sv    


*




       &r   )r   r   r   )ro   r   r   r   r   c                C   s  d}|st ||  t|| |j t|| |j t|| | | d}|d}|  jdd  \}}	t	||  d| d| d t
|||	f n$| jdd  \}}
|jdd  \}}t|| |}|d k	r|s|||f }t	|j|kd| d|j d t	| p|dd d	 |d kr:||||f }|  d
krP| S |  jdd  }|}t| ||\}}}}}t||d d d }t||d
 |d
 f}t||||||| |S )Nbsr_dense_mmr   r6   z"bsr_dense_mm(): dense.size(-1) == z( should be divisible by blocksize[0] == r   z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   )r   r   r   r(   r#   r"   rr   rn   r   r   r5   rB   is_contiguousrv   Z	new_emptyr   Zzero_rp   rx   r   )ro   r   r   r   r   r   r   r!   	row_blockr   r   r    Zoriginal_batch_dims_broadcastedZexpected_out_shaper0   r   rl   rm   rn   r   r   r	   r     sL    



r   )MAX_ROW_NNZTILEc                 C   sV  t jdd}t jdd}t jdd}| ||  ||  }t |}t || }|| }|dkrdd S t d|
}||| k }|||  ||  ||  }t j|| |td dt j}t j|dd}t|
|	|
D ]Z}||
7 }||| k }t j|| |td dt j}t j|dd}t 	||k||}qt 
|| }t j|dd}t|
|	|
D ]\}||
8 }||| k }t j|| |td dt j}t 
|| }|t j|dd7 }qXt j|| || |jj|d t|
|	|
D ]l}||
7 }||| k }t j|| |td dt j}t 
|| }t j|| || |jj|d qd S )Nr   r   r*   r   infr   )r   )r   r   r   r   r&   r   r   r_   rL   whereexpsumr   r#   r   )r   r   r   r   r   r   Zvalues_nnz_col_block_strider   r   r   r   r   Zrow_block_offset_pidr   r   r   r   r   Z
row_aranger   Zcurr_row_values_ptrsZrow_tileZmax_row_valuer   Zcurr_max_row_valuenumdenomr   r   r	   _bsr_softmax_kernel  s^    
"""""r   c                    s  d}t ||  t|| | j |  dks4|  dkr<|  S | jdd  \}}|  }|  jdd  \ d krt	|n
t	| 
 ddd}|  dd r|   }n|  }|dd dddd|  }|jd | f}d }	|dd df d|d	i}
 fd
d}t||
||	 |d| ddj|  j }tj| 
  |   || j| jdS )Nbsr_softmaxr   r   rf   r   r6   .r   r   c                    s&   t |  t|  tdf  d S )Ni   )r   rJ   rV   r   r   max_row_nnzr   r   r	   rb   <  s     zbsr_softmax.<locals>.kernelrq   )r   r(   r#   r   r   cloner   rn   r   Znext_power_of_2rl   re   rh   rv   r   r8   ru   rd   r   rs   rm   r   )r   r   r   r   r!   Znnzrl   rn   rO   rP   rT   rb   r   r   r	   r     sB    

,  	$

r   r   )querykeyvalue	attn_mask	dropout_p	is_causalscalec           
      C   sl  d}t | | d t |d k	| d |d k	s6tt |jtjk| dtj d|j d t||| j t||| j t||| j t||| j t||| j |jtj	k	rt||| j t
|| |ddd	d
d}|d kr| ddks|d	krt d
| d| d |d kr,dt| d n|}	| |	 t|}tjjj| |dd t||}|S )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r   r   r6   r   F)r   r   r   z(): current value of scale == z results in division by zero.r*   T)pZinplace)r   r4   r   r   r   r   r   r(   r#   r   r   rv   rr   mathsqrtrn   r   r   nnZ
functionalZdropoutr   )
r   r   r   r   r   r   r   r   ZsdpaZscale_factorr   r   r	   r   Q  sB    	
 "
r   )N)N)r   FN)+r   r   Ztorch._inductor.cuda_propertiesr   r
   r   r   r   r"   r(   r5   r9   rB   rE   rI   rJ   r]   rd   rp   rt   rx   r   Ztriton.languagelanguager   typingry   rz   ZjitZ	constexprr   r   r   r   ZTensorr   intr   r   r   r   r&   r   r   r   r   r	   <module>   s   
	
!z"{(0XO
F
A   2