U
    9%e                  
   @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z!m"Z" e
 Z#e#j$Z$e#j%Z&e#j'Z'dd Z(e&e)eddd Z*e&e)eddd Z+e&e)eddd Z,e&e)eddd Z-e&e)eddd Z.e&e d d!d" Z/e&e d#d$d% Z0e&e d&d'd( Z1e$ej2j3d)d* Z4e$d+e!d,d- Z5e$ej6j7ej8d.d/ Z9d a:d0d1 Z;e$ej<j=ej>ej?d2d3 Z@e$ej<j=ejAej?e$ej<j=ejBej?d4d5 ZCe$ejDj=ej>ej?d6d7 ZEe$ejDj=ejAej?e$ejDj=ejBej?d8d9 ZFe$ejGd:d; ZHe$ejId<d= ZJe$ejKd>d? ZLe$ejMd@dA ZNe$ejMejOdBdC ZPe$ejQejOejOejOejOejOe$ejQejOejOejRejOejOe$ejQejOejOejSejOejOe$ejQejOejOejTejOejOdDdE ZUe$ejVejOejOejWdFdG ZXe$ejYejOejOe$ejYejOejRe$ejYejOejSe$ejYejOejTdHdI ZZe$ej[ejOejOe$ej[ejOejRe$ej[ejOejSe$ej[ejOejTdJdK Z\e$ej]dLdM Z^e$ej_dNdO Z`e$ejaej?dPdQ Zbe$ejcej?ej?ej?dRdS ZddTdU ZeeejfejgdVdW ZheejgejfdXdY ZidZd[ Zjeejfejkd\d] Zleejkejfeej>ejfd^d_ Zmd`da Znenejojpdb enejqdb enejrdb enejojsdc enejtdc enejudc enejojvdd enejwdd enejxdd e$ejojyejfdedf Zze$ej{ejfdgdh Z|e$ejoj}ejfdidj Z~e$eejfdkdl Ze$ejojejfejfejfdmdn Ze$ejejfejfe$ejejfejfdodp ZdqZdrds Ze$ejojejfejfedt e$ejejfejfedt e$ejojejfejfedu e$ejejfejfedu e$ejojejfejfedv e$ejejfejfedv e$ejojejfejfedw e$ejejfejfedw e$ejojejfejfedx e$ejejfejfedx e$ejojejfejfedy e$ejejfejfedy dzd{ Zeejojd|dw eejojd}dy ejd~ejdiZe$ejeje$ejejdd Ze$ejejdd Ze$ejejdd Ze$ejej?dd Ze$ejejOe$ejejdd Ze$ejejRe$ejejdd Ze$ejej?ej?ej?dd Ze$eejSejSdd Ze$eejTejSe$eejSejTe$eejTejTdd Ze$eejSejSdd Ze$eejTejSe$eejSejTe$eejTejTdd Ze$eejSe$eejTdd Ze$eejSejke$eejTejkdd Zdd Zejd Zdej Ze$ejejSee e$ejejTee e$ejejSee e$ejejTee dd Zdd Ze$ejjqej8ejej?e$ejjqej8ejBej?e$ejjqej8ejAej?edd Ze$ejjtej8ejej?e$ejjtej8ejBej?e$ejjtej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Zdd Zeejjd eejjd eejjd e$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ej?ej?dd Ze$ejjej8ejej?ej?e$ejjej8ejAej?ej?e$ejjej8ejBej?ej?dd Ze$ejej҃dd ZdddZe'e"dd Zee֡ e$ dS )    )reduceN)ir)Registry
lower_cast)parse_dtype)models)typescgutils)ufunc_db)register_ufuncs   )nvvm)cuda)	nvvmutilsstubserrors)dim3
grid_groupCUDADispatcherc                 C   sB   t | d| }t | d| }t | d| }t| |||fS )Nz%s.xz%s.yz%s.z)r   	call_sregr	   Zpack_struct)builderprefixxyz r   R/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/numba/cuda/cudaimpl.pyinitialize_dim3   s    r   Z	threadIdxc                 C   s
   t |dS )Ntidr   contextr   sigargsr   r   r   cuda_threadIdx    s    r$   ZblockDimc                 C   s
   t |dS )NZntidr   r    r   r   r   cuda_blockDim%   s    r%   ZblockIdxc                 C   s
   t |dS )NZctaidr   r    r   r   r   cuda_blockIdx*   s    r&   ZgridDimc                 C   s
   t |dS )NZnctaidr   r    r   r   r   cuda_gridDim/   s    r'   laneidc                 C   s   t |dS )Nr(   )r   r   r    r   r   r   cuda_laneid4   s    r)   r   c                 C   s   | |dS Nr   extract_valuer    r   r   r   dim3_x9   s    r-   r   c                 C   s   | |dS Nr   r+   r    r   r   r   dim3_y>   s    r/   r   c                 C   s   | |dS )N   r+   r    r   r   r   dim3_zC   s    r1   c                 C   s(   |  tjd}|j}|t||fS r.   )get_constantr   int32modulecallr   Z declare_cudaCGGetIntrinsicHandle)r!   r   r"   r#   onelmodr   r   r   cg_this_gridH   s    r8   zGridGroup.syncc                 C   s,   |  tjd}|j}|t|||fS r*   )r2   r   r3   r4   r5   r   Zdeclare_cudaCGSynchronize)r!   r   r"   r#   flagsr7   r   r   r   ptx_sync_groupQ   s    r:   c                 C   s   |d S r*   r   r    r   r   r   cuda_const_array_like\   s    r;   c                 C   s   t d7 a d| t S )zDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})_unique_smem_idformatnamer   r   r   _get_unique_smem_idf   s    r@   c              	   C   s8   |j d j}t|j d }t| ||f|tdtjddS )Nr   r   _cudapy_smemTshapedtypesymbol_name	addrspacecan_dynsized)r#   literal_valuer   _generic_arrayr@   r   ADDRSPACE_SHAREDr!   r   r"   r#   lengthrD   r   r   r   cuda_shared_array_integerp   s    rM   c              	   C   s>   dd |j d D }t|j d }t| |||tdtjddS )Nc                 S   s   g | ]
}|j qS r   rH   .0sr   r   r   
<listcomp>}   s     z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   rA   TrB   )r#   r   rI   r@   r   rJ   r!   r   r"   r#   rC   rD   r   r   r   cuda_shared_array_tuplez   s    
rT   c              	   C   s4   |j d j}t|j d }t| ||f|dtjddS )Nr   r   _cudapy_lmemFrB   )r#   rH   r   rI   r   ADDRSPACE_LOCALrK   r   r   r   cuda_local_array_integer   s    rW   c              	   C   s:   dd |j d D }t|j d }t| |||dtjddS )Nc                 S   s   g | ]
}|j qS r   rN   rO   r   r   r   rR      s     z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   rU   FrB   )r#   r   rI   r   rV   rS   r   r   r   ptx_lmem_alloc_array   s    
rX   c                 C   sD   |rt d}|j}tt d}t|||}||d |  S )Nzllvm.nvvm.membar.ctar   	AssertionErrorr4   r   FunctionTypeVoidTyper	   get_or_insert_functionr5   get_dummy_valuer!   r   r"   r#   fnamer7   fntysyncr   r   r   ptx_threadfence_block   s    rc   c                 C   sD   |rt d}|j}tt d}t|||}||d |  S )Nzllvm.nvvm.membar.sysr   rY   r_   r   r   r   ptx_threadfence_system   s    rd   c                 C   sD   |rt d}|j}tt d}t|||}||d |  S )Nzllvm.nvvm.membar.glr   rY   r_   r   r   r   ptx_threadfence_device   s    re   c                 C   s*   |  tjd}ttj}t| |||gS )Nl    )r2   r   r3   noneptx_syncwarp_mask)r!   r   r"   r#   maskZmask_sigr   r   r   ptx_syncwarp   s    ri   c                 C   sD   d}|j }tt tdf}t|||}||| |  S )Nzllvm.nvvm.bar.warp.sync    )	r4   r   r[   r\   IntTyper	   r]   r5   r^   r_   r   r   r   rg      s    rg   c              
   C   s  |\}}}}}|j d }	|	tjkr6||t|	j}d}
|j}tt	tdtdftdtdtdtdtdf}t
|||
}|	jdkr|||||||f}|	tjkr||d}||d}||t }t
|||f}n||td}||| tjd}||td}|||||||f}|||||||f}||d}||d}||d}||td}||td}||| tjd}|||}|	tjkr||t }t
|||f}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r0   zllvm.nvvm.shfl.sync.i32rj   r   r   @   )r#   r   real_domainbitcastr   rk   bitwidthr4   r[   LiteralStructTyper	   r]   r5   float32r,   	FloatTypeZmake_anonymous_structtruncZlshrr2   i8zextZshlor_float64
DoubleType)r!   r   r"   r#   rh   modevalueindexclampZ
value_typer`   r7   ra   funcretrvpredZfvZvalue1Z
value_lshrZvalue2Zret1Zret2Zrv1Zrv2Zrv1_64Zrv2_64Zrv_shlr   r   r   ptx_shfl_sync_i32   sJ    

 

r   c                 C   s^   d}|j }tttdtdftdtdtdf}t|||}|||S )Nzllvm.nvvm.vote.syncrj   r   )r4   r   r[   rp   rk   r	   r]   r5   )r!   r   r"   r#   r`   r7   ra   r}   r   r   r   ptx_vote_sync  s    r   c                 C   s   |\}}|j d j}|j d tjkr6||t|}d|}|j}t	tdtdt|f}	t
||	|}
||
||fS )Nr   zllvm.nvvm.match.any.sync.i{}rj   )r#   ro   r   rm   rn   r   rk   r=   r4   r[   r	   r]   r5   r!   r   r"   r#   rh   rz   widthr`   r7   ra   r}   r   r   r   ptx_match_any_sync  s    
"r   c                 C   s   |\}}|j d j}|j d tjkr6||t|}d|}|j}t	t
tdtdftdt|f}	t||	|}
||
||fS )Nr   zllvm.nvvm.match.all.sync.i{}rj   )r#   ro   r   rm   rn   r   rk   r=   r4   r[   rp   r	   r]   r5   r   r   r   r   ptx_match_all_sync  s    
r   c                 C   s,   t jt t dg dddd}||g S )Nrj   zactivemask.b32 $0;=rTZside_effectr   	InlineAsmr[   rk   r5   r!   r   r"   r#   
activemaskr   r   r   ptx_activemask/  s      r   c                 C   s,   t jt t dg dddd}||g S )Nrj   zmov.u32 $0, %lanemask_lt;r   Tr   r   r   r   r   r   ptx_lanemask_lt6  s     r   c                 C   s   | |d S r*   )Zctpopr    r   r   r   ptx_popc>  s    r   c                 C   s
   |j | S N)fmar    r   r   r   ptx_fmaC  s    r   c                 C   sD   ddd}z
||  W S  t k
r>   d|  d}t|Y nX d S )N)Zf32f)Zf64d)rj   rl   z$Conversion between float16 and float unsupportedKeyErrorr   ZCudaLoweringErrorro   typemapmsgr   r   r   float16_float_ty_constraintH  s    

r   c           	      C   sd   |j |j kr|S t|j \}}t| |tdg}t|d| dd| d}|||gS )N   zcvt..f16 $0, $1;=,h)ro   r   r   r[   get_value_typerk   r   r5   	r!   r   fromtytotyvalty
constraintra   asmr   r   r   float16_to_float_castR  s    r   c           	      C   sb   |j |j kr|S t|j \}}ttd| |g}t|d| dd| }|||gS )Nr   cvt.rn.f16. $0, $1;=h,)ro   r   r   r[   rk   r   r   r5   r   r   r   r   float_to_float16_cast^  s    r   c                 C   sH   ddddd}z
||  W S  t k
rB   d|  d}t|Y nX d S )Nchrl)   r   rj   rl   z"Conversion between float16 and intr   r   r   r   r   r   float16_int_constraintj  s    
r   c           
      C   sf   |j }t|}|jrdnd}t| |tdg}t|d| | dd| d}	||	|gS )NrQ   ur   zcvt.rni.r   r   r   )	ro   r   signedr   r[   r   rk   r   r5   
r!   r   r   r   r   ro   r   Z
signednessra   r   r   r   r   float16_to_integer_castt  s    
r   c           
      C   sd   |j }t|}|jrdnd}ttd| |g}t|d| | dd| }	||	|gS )NrQ   r   r   r   r   r   )	ro   r   r   r   r[   rk   r   r   r5   r   r   r   r   integer_to_float16_cast  s    
r   c                    s    t | tjtj fdd}d S )Nc                    sB   t t dt dt dg}t |  dd}|||S )Nr   z.f16 $0,$1,$2;=h,h,hr   r[   rk   r   r5   r!   r   r"   r#   ra   r   opr   r   ptx_fp16_binary  s
    z*lower_fp16_binary.<locals>.ptx_fp16_binarylowerr   float16)fnr   r   r   r   r   lower_fp16_binary  s    r   addsubmulc                 C   s4   t t dt dg}t |dd}|||S )Nr   zneg.f16 $0, $1;=h,hr   r   r   r   r   ptx_fp16_hneg  s    r   c                 C   s   t | |||S r   )r   r    r   r   r   operator_hneg  s    r   c                 C   s4   t t dt dg}t |dd}|||S )Nr   zabs.f16 $0, $1;r   r   r   r   r   r   ptx_fp16_habs  s    r   c                 C   s   t | |||S r   )r   r    r   r   r   operator_habs  s    r   c                 C   sH   t dt dt dg}t t d|}t |dd}|||S )Nr   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   rk   r[   r   r5   )r!   r   r"   r#   Zargtysra   r   r   r   r   ptx_hfma  s    r   c                 C   s   dd }|  ||||S )Nc                 S   s   t j| |S r   )r   fp16Zhdiv)r   r   r   r   r   fp16_div  s    zfp16_div_impl.<locals>.fp16_divZcompile_internal)r!   r   r"   r#   r   r   r   r   fp16_div_impl  s    r   z{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                    s    fdd}|S )Nc           	         sr   t t dt dt dg}t |tj dd}|||}| tj	d}|
|t d}|d||S )Nr   r   r   r   z!=)r   r[   rk   r   	_fp16_cmpr=   r5   r2   r   Zint16rn   Zicmp_unsigned)	r!   r   r"   r#   ra   r   resultzeroZ
int_resultr   r   r   ptx_fp16_comparison  s    "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )r   r   r   r   r   _gen_fp16_cmp  s    r   eqnegegtleltc                    s    t | tjtj fdd}d S )Nc                    s(   t  | |||}|||d |d S )Nr   r   )r   select)r!   r   r"   r#   choicer   r   r   ptx_fp16_minmax  s    z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxr   )r   r`   r   r   r   r   r   lower_fp16_minmax  s    r   maxminZ
__nv_cbrtfZ	__nv_cbrtc           
      C   sF   |j }t| }| |}|j}t||g}t|||}	||	|S r   )	return_type
cbrt_funcsr   r4   r   r[   r	   r]   r5   )
r!   r   r"   r#   r   r`   Zftyr7   ra   r   r   r   r   ptx_cbrt  s    
r   c              	   C   s2   t |jttdtdfd}|||S )Nrj   Z	__nv_brevr	   r]   r4   r   r[   rk   r5   r!   r   r"   r#   r   r   r   r   ptx_brev_u4  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nrl   Z__nv_brevllr   r   r   r   r   ptx_brev_u8  s    r   c                 C   s   | |d | tjdS r*   )Zctlzr2   r   booleanr    r   r   r   ptx_clz'  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nrj   Z__nv_ffsr   r   r   r   r   
ptx_ffs_32.  s    r   c              	   C   s2   t |jttdtdfd}|||S )Nrj   rl   Z
__nv_ffsllr   r   r   r   r   
ptx_ffs_648  s    r   c                 C   s   |\}}}| |||S r   )r   )r!   r   r"   r#   testabr   r   r   ptx_selpB  s    
r   c              	   C   s4   t |jtt t t fd}|||S )NZ
__nv_fmaxfr	   r]   r4   r   r[   rr   r5   r   r   r   r   
ptx_max_f4H  s    r   c              
   C   sh   t |jtt t t fd}||| ||d |jd t	j
| ||d |jd t	j
gS )NZ	__nv_fmaxr   r   r	   r]   r4   r   r[   rx   r5   castr#   r   doubler   r   r   r   
ptx_max_f8S  s    r   c              	   C   s4   t |jtt t t fd}|||S )NZ
__nv_fminfr   r   r   r   r   
ptx_min_f4d  s    r   c              
   C   sh   t |jtt t t fd}||| ||d |jd t	j
| ||d |jd t	j
gS )NZ	__nv_fminr   r   r   r   r   r   r   
ptx_min_f8o  s    r   c              	   C   sJ   t |jttdt fd}||| ||d |j	d t
jgS )Nrl   Z__nv_llrintr   )r	   r]   r4   r   r[   rk   rx   r5   r   r#   r   r   r   r   r   r   	ptx_round  s    r   c                 C   s   dd }|  ||||S )Nc                 S   s   t | st | r| S |dkrb|dkr:d|d  }d}nd| }d}| | | }t |rt| S nd|  }| | }t|}t || dkrdt|d  }|dkr|| | }n||9 }|S )Nr      g      $@gMDg      ?g      ?g       @)mathisinfisnanroundfabs)r   ndigitsZpow1Zpow2r   r   r   r   r   round_ndigits  s(    

z$round_to_impl.<locals>.round_ndigitsr   )r!   r   r"   r#   r   r   r   r   round_to_impl  s    !r  c                    s    fdd}|S )Nc                    s$   |j \}| | }|||d S r*   )r#   r2   Zfmul)r!   r   r"   r#   Zargtyfactorconstr   r   impl  s    zgen_deg_rad.<locals>.implr   )r  r  r   r  r   gen_deg_rad  s    r  g     f@c                    s   |t jkr t j|dd}|g}ntj |t|d} fddt||D }|j}||krltd||f |j	t|krtd|j	t|f ||fS )z4
    Convert integer indices into tuple of intp
    r   )rD   count)r  c                    s"   g | ]\}}  ||tjqS r   )r   r   intp)rP   tir   r!   r   r   rR     s   z&_normalize_indices.<locals>.<listcomp>zexpect %s but got %sz#indexing %d-D array with %d-D index)
r   Zinteger_domainUniTupler	   Zunpack_tuplelenziprD   	TypeErrorndim)r!   r   indtyindsarytyvaltyindicesrD   r   r  r   _normalize_indices  s    
r  c                    s    fdd}|S )Nc                    sj   |j \}}}|\}}}	|j}
t| |||||\}}| || ||}tj| ||||dd} | ||
||	S )NTZ
wraparound)r#   rD   r  
make_arrayr	   get_item_pointer)r!   r   r"   r#   r  r  r  aryr  r   rD   r  laryptrdispatch_fnr   r   imp  s    

 z_atomic_dispatcher.<locals>.impr   )r  r  r   r  r   _atomic_dispatcher  s    r   c                 C   s`   |t jkr&|j}|t|||fS |t jkrL|j}|t|||fS |d||dS d S )Nr   	monotonic)	r   rq   r4   r5   r   Zdeclare_atomic_add_float32rw   Zdeclare_atomic_add_float64
atomic_rmwr!   r   rD   r  r   r7   r   r   r   ptx_atomic_add_tuple  s    

r$  c                 C   s`   |t jkr&|j}|t|||fS |t jkrL|j}|t|||fS |d||dS d S )Nr   r!  )	r   rq   r4   r5   r   Zdeclare_atomic_sub_float32rw   Zdeclare_atomic_sub_float64r"  r#  r   r   r   ptx_atomic_sub  s    

r%  c                 C   sP   |t jjkr<|j}|j}ttd| }|||||fS td| dd S )NZdeclare_atomic_inc_intzUnimplemented atomic inc with  array	r   cudadeclZunsigned_int_numba_typesro   r4   getattrr   r5   r  r!   r   rD   r  r   bwr7   r   r   r   r   ptx_atomic_inc  s    r,  c                 C   sP   |t jjkr<|j}|j}ttd| }|||||fS td| dd S )NZdeclare_atomic_dec_intzUnimplemented atomic dec with r&  r'  r*  r   r   r   ptx_atomic_dec"  s    r-  c                    s@   t  fdd}tjtjtjfD ]}t| tj|tj| q d S )Nc                    s6   |t jjkr| ||dS td  d| dd S )Nr!  zUnimplemented atomic z with r&  r   r(  integer_numba_typesr"  r  r!   r   rD   r  r   r   r   r   impl_ptx_atomic1  s    z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)r   r   r  r  Tupler   ArrayAny)Zstubr   r1  r   r   r   r   ptx_atomic_bitwise0  s    r5  andorxorc                 C   s0   |t jjkr|d||dS td| dd S )NZxchgr!  zUnimplemented atomic exch with r&  r.  r0  r   r   r   ptx_atomic_exchA  s    r9  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S Nr   r!  ZorderingZumaxz&Unimplemented atomic max with %s array)r4   r   rw   r5   r   Zdeclare_atomic_max_float64rq   Zdeclare_atomic_max_float32r3   int64r"  uint32uint64r  r#  r   r   r   ptx_atomic_maxL  s    

r?  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S Nr   r!  r;  Zuminz&Unimplemented atomic min with %s array)r4   r   rw   r5   r   Zdeclare_atomic_min_float64rq   Zdeclare_atomic_min_float32r3   r<  r"  r=  r>  r  r#  r   r   r   ptx_atomic_min`  s    

rA  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S r:  )r4   r   rw   r5   r   Zdeclare_atomic_nanmax_float64rq   Zdeclare_atomic_nanmax_float32r3   r<  r"  r=  r>  r  r#  r   r   r   ptx_atomic_nanmaxt  s    

rB  c                 C   s   |j }|tjkr&|t|||fS |tjkrF|t|||fS |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S r@  )r4   r   rw   r5   r   Zdeclare_atomic_nanmin_float64rq   Zdeclare_atomic_nanmin_float32r3   r<  r"  r=  r>  r  r#  r   r   r   ptx_atomic_nanmin  s    

rC  c                 C   sT   | |jd tj|jd |jd }|d | tjd|d |d f}t| |||S )Nr   r   r0   )r   r#   r   r  r2   ptx_atomic_casr    r   r   r   ptx_atomic_compare_and_swap  s    $"rE  c                 C   s   |j \}}}}|\}}	}
}t| |||	||\}}| || ||}tj| ||||dd}|jtjjkr|j	}|jj
}t|||||
|S td|j d S )NTr  z&Unimplemented atomic cas with %s array)r#   r  r  r	   r  rD   r   r(  r/  r4   ro   r   Zatomic_cmpxchgr  )r!   r   r"   r#   r  r  Zoldtyr  r  r  oldr   r  r  r  r7   ro   r   r   r   rD    s    rD  c                 C   s@   t jt t  t dgdddd}|d }|||g d S )Nrj   znanosleep.u32 $0;r   Tr   r   )r   r   r[   r\   rk   r5   )r!   r   r"   r#   	nanosleepnsr   r   r   ptx_nanosleep  s      rI  Fc               	      sf  t tj|d}|dko$|o$t|dk}|dkr:|s:td j| }	t|tjtj	fpjt|	t
jpj|tjk}
|tjkr|
std|  |}t||}|tjkrtj|||d}nh|j}t||||} |}d|d  > |_|rd|_nt|tj|_||t t!dd}t"#t$ j%} |}|&|}|}g }t't(|D ]\}}|)| ||9 }qXd	d
 t(|D } fdd
|D }|rtj*t+t!dg dddd}|,|-|g t!d} .tj/|}|0||g}n fdd
|D }t|}tj1||dd} 2| |} j3||4||j5j6|| .tj/|d d |7 S )Nr   r   zarray length <= 0zunsupported type: %sr>   Zexternalr   Zgenericc                 S   s   g | ]}|qS r   r   rO   r   r   r   rR     s     z"_generic_array.<locals>.<listcomp>c                    s   g | ]}  tj|qS r   r2   r   r  rO   r!   r   r   rR     s     rj   zmov.u32 $0, %dynamic_smem_size;r   Tr   rl   c                    s   g | ]}  tj|qS r   rJ  rO   rK  r   r   rR     s     C)rD   r  Zlayout)datarC   stridesitemsizeZmeminfo)8r   operatorr   r  
ValueErrorZdata_model_manager
isinstancer   ZRecordBooleanr   ZStructModelr   Znumber_domainr  Zget_data_typer   	ArrayTyper   rV   r	   Zalloca_oncer4   Zadd_global_variableZget_abi_sizeof
bit_lengthalignlinkageConstant	UndefinedZinitializerZaddrspacecastZPointerTyperk   llZcreate_target_dataZNVVMZdata_layoutZget_abi_size	enumeratereversedappendr   r[   ru   r5   r2   r  Zudivr3  r  Zpopulate_arrayrn   rM  typeZ	_getvalue) r!   r   rC   rD   rE   rF   rG   Z	elemcountZdynamic_smemZ
data_modelZother_supported_typeZlldtypeZlarytyZdataptrr7   ZgvmemrV  Z
targetdatarO  Z
laststrideZrstridesr
  ZlastsizerN  ZkstridesZget_dynshared_sizeZdynsmem_sizeZ	kitemsizeZkshaper  r  r  r   rK  r   rI     sz    








 rI   c                 C   s   |   S r   )r^   )r!   r   r   Zpyvalr   r   r   cuda_dispatcher_const*  s    r_  )F)	functoolsr   rP  r   Zllvmliter   Zllvmlite.bindingZbindingrZ  Znumba.core.imputilsr   r   Znumba.core.typing.npydeclr   Znumba.core.datamodelr   Z
numba.corer   r	   Znumba.npr
   Znumba.np.npyimplr   Zcudadrvr   Znumbar   Z
numba.cudar   r   r   Znumba.cuda.typesr   r   r   registryr   Zlower_getattrZ
lower_attrZlower_constantr   Moduler$   r%   r&   r'   r)   r-   r/   r1   ZcgZ	this_gridr8   r:   r  Z
array_liker3  r;   r<   r@   ZsharedarrayZIntegerLiteralr4  rM   r2  r  rT   localrW   rX   Zthreadfence_blockrc   Zthreadfence_systemrd   Zthreadfencere   Zsyncwarpri   i4rg   Zshfl_sync_intrinsicrt   Zf4Zf8r   Zvote_sync_intrinsicr   r   Zmatch_any_syncr   Zmatch_all_syncr   r   r   Zlanemask_ltr   Zpopcr   r   r   r   r   Floatr   r   r   Integerr   r   r   r   Zhaddr   iaddZhsubr   isubZhmulr   imulZhnegr   negr   Zhabsr   absr   Zhfmar   truedivitruedivr   r   r   Zheqr   hner   Zhger   Zhgtr   hler   Zhltr   r   ZhmaxZhminrq   rw   r   Zcbrtr   ZbrevZu4r   u8r   Zclzr   Zffsr   r   Zselpr   r   r   r   r   r   r   r   r   r  r  piZ_deg2radZ_rad2degradiansdegreesr  r   Zatomicr  r$  r%  incr,  decr-  r5  and_rv   r8  Zexchr9  r?  rA  ZnanmaxrB  ZnanminrC  Zcompare_and_swaprE  ZcasrD  rG  r=  rI  rI   r_  Z
get_ufuncsr   r   r   r   <module>   s  














		
		







	.










	





  










%





 
d
