U
    9%eY                  	   @  s  d dl mZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZ d dlmZmZmZmZmZmZmZmZmZ ddlmZmZ ejejejeZdZd.dd	Z d
d Z!dd Z"dd Z#edZ$G dd dej%Z&e' dd Z(G dd dee$ Z)G dd de)e$ Z*edddddZ+eddddddddd d!d"dZ+d/dddddd#d$ddddd%d&d'dZ+G d(d) d)Z,G d*d+ d+Z-d,d- Z.dS )0    )annotationsdivisionN)defaultdict
namedtuple)	CallableGenericIterableListOptionalTypeVarUnioncastoverload   )get_backendpath_to_ptxasz2.1.0c                 C  sT   | d krt  } zddlm} || W S  tk
rN   dd l}|j| j Y S X d S )Nr   )_cuda_getCurrentRawStream)get_current_deviceZtorch._Cr   ImportErrortorchcudaZcurrent_streamZcuda_stream)idxr   r    r   Q/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/triton/runtime/jit.pyget_cuda_stream   s    
r   c                  C  s   dd l } | j S Nr   )r   r   Zcurrent_device)r   r   r   r   r      s    r   c                 C  s   dd l }|j|  d S r   )r   r   Z
set_devicer   r   r   r   r   set_current_device$   s    r   c                 C  s   dd l }|j| S r   )r   r   get_device_capabilityr   r   r   r   r   )   s    r   Tc                      s>   e Zd ZdZdd fddZdd Zdd	 Zd
d Z  ZS )DependenciesFinderz
    This AST visitor is used to find dependencies of a JITFunction. This can
    be used to invalidate a JITFunction's hash when its source code -- or
    that of its dependencies -- changes.
    Nonereturnc                   s*   t    t|d | _|| _d S )Nutf-8)super__init__hashlibmd5encode	hexdigestretglobals)selfr,   src	__class__r   r   r&   <   s    
zDependenciesFinder.__init__c                 C  s   | j |jd S N)r,   getid)r-   noder   r   r   
visit_NameA   s    zDependenciesFinder.visit_Namec                 C  s`   |  |j}t|tjr&|  |j}q|d ksPt|dddksPt|dddrTd S t||jS )N__name__ Ztritonz.triton)visitvalue
isinstanceast	Attributegetattrendswithattr)r-   r4   lhsr   r   r   visit_AttributeD   s    *z"DependenciesFinder.visit_Attributec                 C  s   |  |j}|d krd S t|r&d S |jrF|jdsBd|jkrFd S t|tsbtd|j	 d|j
d krt|j}t|j|j}| | |j|_
tt|dd}| j|j
 | d| _t| j | _d S )Nztriton.z.triton.z
Function "zv" is being called from a Triton function but is not a Triton function itself. Decorate it with @triton.jit to fix thisnoinlineFr$   )r8   funcinspect	isbuiltin
__module__
startswithr:   JITFunctionAssertionErrorr6   hashr;   parser.   r    __globals__r+   strr=   r)   r'   r(   r*   )r-   r4   rC   treefinderrB   r   r   r   
visit_CallL   s     


zDependenciesFinder.visit_Call)	r6   rF   __qualname____doc__r&   r5   rA   rP   __classcell__r   r   r/   r   r    5   s
   r    c               
   C  sh  dd l } g }ttd}|t|  g7 }W 5 Q R X tj	t
d}| |gD ]>}t|j|jjd}|t|  g7 }W 5 Q R X qTttj	t
dd}|t|  g7 }W 5 Q R X tj	t
d}| |gD ]>}t|j|jjd}|t|  g7 }W 5 Q R X qt d }tt|dg }d	td | d d	| S )Nr   rbcompilerz_C/libtriton.solanguagez	--version-)pkgutilopen__file__r'   r(   readr*   ospathjoinTRITON_PATHiter_modulesmodule_finder	find_specnameoriginr   
subprocesscheck_outputTRITON_VERSION)rX   contentsfZcompiler_pathlibZlanguage_pathZptxasZptxas_versionr   r   r   version_keyc   s"    "$"$
rk   c                   @  s$   e Zd ZU ded< ddddZdS )KernelInterfacer   runr"   c                 C  s   t ttjt t| j|dS )z
        A JIT function is launched with: fn[grid](*args, **kwargs).
        Hence JITFunction.__getitem__ returns a callable proxy that
        memorizes the grid.
        )grid)r   r   	functoolspartialr   rm   )r-   rn   r   r   r   __getitem__   s    zKernelInterface.__getitem__N)r6   rF   rQ   __annotations__rq   r   r   r   r   rl   }   s   
rl   c                      s   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	dd Z
edd Zdd Zdd Zdd ZddddZddddZddddddZd d! Zd0d"d#Zed$d% Zd&d' Zd(d) Zd*d+ Z fd,d-Zd.d/ Z  ZS )1rH   N   c                 C  s   t | dr| jS t| trdS t| trVd| kr<| dkr<dS d| krP| dkrPdS d	S n2t| trdd
S | d krpd S tdt|  d|  d S )Ndtypei1i   ii32l            l    u64i64fp32zUnsupported type z for )hasattrrt   r:   boolintfloat	TypeErrortypeargr   r   r   _key_of   s    



zJITFunction._key_ofc                 C  s"   t | drt | jdr| jjS dS )Ndevicer   r7   )rz   r   r   r   r   r   r   
_device_of   s    
zJITFunction._device_ofc                 C  s"   t | drt| jtr|  S dS )N	is_pinnedF)rz   r:   r   r   r   r   r   r   _pinned_memory_of   s    
zJITFunction._pinned_memory_ofc                 C  sD   t | dr|  tj dkS t| tr:| d dk| dkfS | d kfS )Ndata_ptrr   rs      rz   r   rH   divisibilityr:   r|   r   r   r   r   _spec_of   s
    

zJITFunction._spec_ofc                   sR   dd   fddt |D }fddt |D }tdddgt|t|S )	Nc                 S  sD   t | dr|  tj dkS t| tr4| tj dkS | d kr@dS dS )Nr   r   TFr   )xr   r   r   is_divisible_by_16   s    

z3JITFunction._get_config.<locals>.is_divisible_by_16c                   s&   h | ]\}} |r|j kr|qS r   )do_not_specialize.0ir   r   r-   r   r   	<setcomp>   s      
 z*JITFunction._get_config.<locals>.<setcomp>c                   s:   h | ]2\}}t |tst |tr|d kr| jkr|qS )r   )r:   r{   r|   r   r   r-   r   r   r      s     
 
  
 Zinstance_descriptordivisible_by_16
equal_to_1)	enumerater   tuple)r-   argsr   r   r   r   r   _get_config   s    zJITFunction._get_configc                 C  sz   | d krdS t | dd }dddddd	d
dddddddddd}t| D ]}|||< qPt| t rl| S d||  S )Nz*i8.ru   Zfp8e4Zfp8e5Zfp8e4b15Zfp16Zbf16ry   Zfp64i8Zi16rv   rx   u8u16u32rw   )r{   Zfloat8e4Zfloat8e5Zfloat8e4b15Zfloat16Zbfloat16Zfloat32Zfloat64Zint8Zint16Zint32Zint64Zuint8Zuint16Zuint32Zuint64*)rM   splitlistvaluesr:   )keyZ	dtype_strZtysvr   r   r   _type_of   s.    
zJITFunction._type_ofc                   s    d  fddt|D }|S )N,c                   s   g | ]\}}  |qS r   )r   )r   r   kr   r   r   
<listcomp>   s     z/JITFunction._make_signature.<locals>.<listcomp>)r^   r   )r-   Zsig_key	signaturer   r   r   _make_signature   s    zJITFunction._make_signaturec                 C  s   t t| j|}|S r1   )dictzip
constexprs)r-   Zconstexpr_key	constantsr   r   r   _make_constants   s    zJITFunction._make_constantsc	              	   C  s   t jd krdS | jj}	| jj}
ddd t| j|d D }|	 d| d| d| d	}t|}G d
d d}t	|||||||d}t j||||
|	d|i|dddS )NF, c                 S  s   g | ]\}}| d | qS )z: r   r   rc   tyr   r   r   r      s     z*JITFunction._call_hook.<locals>.<listcomp>r   z[num_warps=z, num_stages=]()c                   @  s   e Zd Zdd ZdS )z.JITFunction._call_hook.<locals>.LegacyCompilerc                 S  s   || _ || _d S r1   )modulerc   )r-   r   rc   r   r   r   r&      s    z7JITFunction._call_hook.<locals>.LegacyCompiler.__init__N)r6   rF   rQ   r&   r   r   r   r   LegacyCompiler   s   r   )r   r   r   	num_warps
num_stagesextern_libsconfigsr   )r   reprfncompileZis_manual_warmupZalready_compiled)
rH   
cache_hookr   r6   rF   r^   r   	arg_namesrM   r   )r-   r   r   r   r   r   r   r   r   rc   r   Z	arg_reprsr   r   kwargsr   r   r   
_call_hook   s    
   zJITFunction._call_hookrM   r"   c                 C  s   | j |d}|dkrJd| dtj d| d| dtj d| d| d	S d
|krfd| dtj dS |dkrd| dtj d| dS dS d S )Nr7   (z.data_ptr() % z == 0) if hasattr(z,, "data_ptr")                         else (z % z == 0, z == 1) if isinstance(z,, int)                         else (False,)Tensorz == 0)r|   z == 1)z(False,))rr   r2   rH   r   r-   r   Zarg_annotationr   r   r   _get_arg_specialization_key  s    4z'JITFunction._get_arg_specialization_keyc                 C  sH   | j |d}d|kr | dS |dkr,dS |dkr8dS d| d	S d S )
Nr7   r   z.dtyper{   ru   r}   ry   z_key_of(r   )rr   r2   r   r   r   r   _get_arg_sig_key  s    
zJITFunction._get_arg_sig_keyz	List[str]z
List[bool])device_typespinned_memory_flagsr#   c                 C  sv   dd |D }d|kr.dd l }|jjr*dS dS tdd |D }tdd |D }|r^|r^dS t|dkrr|d S dS )	Nc                 S  s   g | ]}|d kr|qS )r7   r   r   Zdevice_typer   r   r   r     s      z5JITFunction._conclude_device_type.<locals>.<listcomp>r   r   hipc                 s  s   | ]}|d kV  qdS )cpuNr   r   r   r   r   	<genexpr>"  s     z4JITFunction._conclude_device_type.<locals>.<genexpr>c                 s  s   | ]
}|V  qd S r1   r   )r   Zpinned_memory_flagr   r   r   r   #  s     )r   versionr   allanylen)r-   r   r   r   Zis_cpuZis_pinned_memoryr   r   r   _conclude_device_type  s    z!JITFunction._conclude_device_typec                   s   fddt  jD } fddt  jD }d|}d fdd|D }dddd |D  d }ddd	d |D  d }d|}g }t |D ]$\}	}
|	 jkrq| |
g7 }qd|}d
dd  jD }ddd t j jD }d jj d| d| dt	|dkr4| d
nd dt	|dkrR| d
nd d| d| d| d| d| dddd  jD  d}t
 t  j j j j jttttd}t|| | jj S )Nc                   s    g | ]\}}| j kr| qS r   r   r   r   r   r   r   +  s     
 z.JITFunction._make_launcher.<locals>.<listcomp>c                   s    g | ]\}}| j kr| qS r   r   r   r   r   r   r   ,  s     
 r   c                   s   g | ]}  |qS r   )r   r   r   r   r   r   r   /  s     [c                 S  s   g | ]}d | dqS )z_device_of(r   r   r   r   r   r   r   0  s     ]c                 S  s   g | ]}d | dqS )z_pinned_memory_of(r   r   r   r   r   r   r   1  s     r   c                 S  s   g | ]}d | d| qS )"z": r   r   r   r   r   r   <  s     c                 s  s.   | ]&\}}|t jkr|n| d | V  qdS )z = NrD   _empty)r   rc   Zdfltr   r   r   r   =  s     z-JITFunction._make_launcher.<locals>.<genexpr>z
def r   z, grid=None, num_warps=4, num_stages=3, extern_libs=None, stream=None, warmup=False, device=None, device_type=None):
    from ..compiler import compile, CompiledKernel
    sig_key =  z,
    constexpr_key = r   r   z
    spec_key = aV  
    key = (version_key, sig_key, constexpr_key, spec_key, num_warps, num_stages, self.debug)
    if not extern_libs is None:
      key = (key, tuple(extern_libs.items()))
    assert num_warps > 0 and (num_warps & (num_warps - 1)) == 0, "num_warps must be a power of 2"
    assert grid is not None
    if callable(grid):
        grid = grid({z})
    grid_size = len(grid)
    grid_0 = grid[0]
    grid_1 = grid[1] if grid_size > 1 else 1
    grid_2 = grid[2] if grid_size > 2 else 1

    if device_type is None:
        device_types = [_device_type for _device_type in zW if _device_type != '']
        device_type = self._conclude_device_type(device_types, a  )

    device_backend = None
    if device_type not in ['cuda', 'hip']:
        device_backend = get_backend(device_type)
        if device_backend is None:
            raise ValueError('Cannot find backend for ' + device_type)

    if device is None:
        if device_type in ['cuda', 'hip']:
            device = get_current_device()
            set_current_device(device)
        else:
            device = device_backend.get_current_device()
            device_backend.set_current_device(device)
    if stream is None and not warmup:
        if device_type in ['cuda', 'hip']:
            stream = get_cuda_stream(device)
        else:
            stream = device_backend.get_stream()

    bin = cache[device].get(key, None)
    if bin is not None:
      if not warmup:
          bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, bin, zt)
      return bin
    # kernel not cached -- compile
    else:
      # build dict of constant values
      args = [z]
      all_args = c                 S  s   g | ]
}| qS r   r   r   r   r   r   r   q  s     a  ,
      configs = self._get_config(*all_args),
      constants = self._make_constants(constexpr_key)
      constants.update({i: None for i, arg in enumerate(all_args) if arg is None})
      constants.update({i: 1 for i in configs[0].equal_to_1})
      # build kernel signature -- doesn't include specialized arguments
      signature = { i: self._type_of(_key_of(arg)) for i, arg in enumerate(all_args) if i not in self.constexprs }
      # build stub signature -- includes arguments that are specialized
      for i, arg in constants.items():
        if callable(arg):
          raise TypeError(f"Callable constexpr at index {i} is not supported")
      if not self._call_hook(key, signature, device, constants, num_warps, num_stages, extern_libs, configs):
        bin = compile(self, signature=signature, device=device, constants=constants, num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, configs=configs, debug=self.debug, device_type=device_type)
        if not warmup:
            bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, bin, *args)
        self.cache[device][key] = bin
        return bin
      return None
)rk   r   r-   r   r   r   r   cache__spec__r   r   r   )r   r   r^   r   r   r   arg_defaultsr   r6   r   rk   r   r   r   r   r   r   r   r   r   r   exec)r-   Zregular_argsZconstexpr_argsr   Zsig_keysr   r   Zconstexpr_keysZspecializationsr   r   Z	spec_keysZ	grid_argsZargs_signaturer.   scoper   r   r   _make_launcher*  sj    



,,2E
zJITFunction._make_launcherc                   sZ  |_ |j_|_t|}dd |j D _dd |j D _	t
dd j	D _|d krlg n|_fddjD _tt|_jjdd  _tt_d _g _d _tjd	d
dkrdn|_|_dd   fdd|j D _fddj D _  _!|j"_"|j#_#|j$_$|j_d S )Nc                 S  s   g | ]
}|j qS r   )rc   r   r   r   r   r   r     s     z(JITFunction.__init__.<locals>.<listcomp>c                 S  s   g | ]
}|j qS r   )defaultr   r   r   r   r     s     c                 s  s   | ]}|t jkV  qd S r1   r   r   r   r   r   r     s     z'JITFunction.__init__.<locals>.<genexpr>c                   s&   h | ]}t |tr j|n|qS r   )r:   rM   r   indexr   r   r   r   r     s     z'JITFunction.__init__.<locals>.<setcomp>defZTRITON_DEBUG01Tc                 S  s   t | tr| jS | S r1   )r:   r   r6   )r   r   r   r   <lambda>      z&JITFunction.__init__.<locals>.<lambda>c                   s   i | ]\}}| |qS r   r   r   )normalize_tyr   r   
<dictcomp>  s      z(JITFunction.__init__.<locals>.<dictcomp>c                   s$   g | ]\}}d |kr j |qS )Z	constexpr)r   r   r   r   r   r   r     s      )%r   rF   r   r   rD   r   
parametersr   r   r   r   Zhas_defaultsr   textwrapdedent	getsourcer.   findr   r   r   rJ   kernel_decoratorskernelr\   environr2   debugrB   rr   itemsr   r   rm   rR   r6   rL   )r-   r   r   r   r   rB   r   r   )r   r-   r   r&     s2    


zJITFunction.__init__c                 C  s<   | j d kr6t| j| jd}||   |jt  | _ | j S )N)r,   r.   )rJ   r    rL   r.   r8   rK   r+   rk   )r-   Zdependencies_finderr   r   r   	cache_key  s
    
zJITFunction.cache_keyc                 O  s   | j ttj||ddiS )NwarmupT)rm   map
MockTensor
wrap_dtyper-   r   r   r   r   r   r     s    zJITFunction.warmupc                 C  sH   t | j}t|t jstt|jdks.tt|jd t jsDt|S )Nr   r   )	r;   rK   r.   r:   ModulerI   r   bodyFunctionDef)r-   rN   r   r   r   rK     s
    zJITFunction.parsec                 O  s   t dd S )Nz:Cannot call @triton.jit'd outside of the scope of a kernel)RuntimeErrorr   r   r   r   __call__  s    zJITFunction.__call__c                   s2   |dkrd | _ tt| || |dkr.d | _d S )Nr   r.   )r   r%   rH   __setattr__rJ   )r-   rc   r9   r/   r   r   r     s
    zJITFunction.__setattr__c                 C  s   d| j  d| jj dS )NzJITFunction(:r   )r   r   r6   r   r   r   r   __repr__  s    zJITFunction.__repr__)NNNN)r6   rF   rQ   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r&   propertyr   r   rK   r   r   r  rS   r   r   r/   r   rH      s8   




i
%
rH   JITFunction[T]r   r#   c                 C  s   d S r1   r   )r   r   r   r   jit  s    r  r   r   r   rB   zOptional[Iterable[int]]zOptional[bool]zCallable[[T], JITFunction[T]])r   r   rB   r#   c                 C  s   d S r1   r   r  r   r   r   r    s    )r   r   r   rB   	interpretzOptional[T]z4Union[JITFunction[T], Callable[[T], JITFunction[T]]])r   r   r   rB   r	  r#   c                  s4   ddd fdd}| dk	r,|| S |S dS )a<  
    Decorator for JIT-compiling a function using the Triton compiler.

    :note: When a jit'd function is called, arguments are
        implicitly converted to pointers if they have a :code:`.data_ptr()` method
        and a `.dtype` attribute.

    :note: This function will be compiled and run on the GPU. It will only have access to:

           * python primitives,
           * builtins within the triton package,
           * arguments to this function,
           * other jit'd functions

    :param fn: the function to be jit-compiled
    :type fn: Callable
    r   r  r  c                   s:   t | str$ddlm} || S t|  dS d S )Nr   )GridSelectorr  )callablerI   Zinterpreter.interpreterr
  rH   )r   r
  r   r   r	  rB   r   r   r   	decorator  s    zjit.<locals>.decoratorNr   )r   r   r   r   rB   r	  r  r   r  r   r    s    c                   @  s0   e Zd ZdZedd Zdd Zedd ZdS )	r   zr
    Can be used in place of real tensors when calling:
        kernel.warmup(MockTensor(torch.float32), ...)
    c                 C  s"   | j jdkr| jdkrt| S | S )Nrt   r   )r0   r6   rF   r   r   r   r   r   r   -  s
    zMockTensor.wrap_dtypec                 C  s
   || _ d S r1   )rt   )r-   rt   r   r   r   r&   4  s    zMockTensor.__init__c                   C  s   dS r   r   r   r   r   r   r   7  s    zMockTensor.data_ptrN)r6   rF   rQ   rR   r  r   r&   r   r   r   r   r   r   (  s   
r   c                   @  s2   e Zd Zdd Zdd Zdd Zddd	d
ZdS )TensorWrapperc                 C  s*   || _ || _|j| _|j| _| jj| _d S r1   )rt   baseZis_cudar   shape)r-   r  rt   r   r   r   r&   =  s
    zTensorWrapper.__init__c                 C  s
   | j  S r1   )r  r   r   r   r   r   r   D  s    zTensorWrapper.data_ptrc                 C  s   | j |S r1   )r  stride)r-   r   r   r   r   r  G  s    zTensorWrapper.striderM   r"   c                 C  s   d| j  d| j dS )NzTensorWrapper[r   r   )rt   r  r   r   r   r   __str__J  s    zTensorWrapper.__str__N)r6   rF   rQ   r&   r   r  r  r   r   r   r   r  <  s   r  c                 C  sV   t | tr*|| jjkr| jS t| j|S n(t| dr>t| |S tdt|  dd S )Nr   zCannot reinterpret a r   )r:   r  r  rt   rz   r~   r   )Ztensorrt   r   r   r   reinterpretN  s    


r  )N)N)/
__future__r   r   r;   ro   r'   rD   r\   re   r   collectionsr   r   typingr   r   r   r	   r
   r   r   r   r   Zcommon.backendr   r   r]   dirnameabspathrZ   r_   rg   r   r   r   r   r   NodeVisitorr    	lru_cacherk   rl   rH   r  r   r  r  r   r   r   r   <module>   sV   ,
.
  ^ 3