U
    9%eY                     @  sP  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZ ddlmZmZmZmZmZmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, dd Z-dd Z.dd Z/dd Z0dd Z1dd Z2dd Z3e4 dd d!d"Z5dTd#ddd$d%d&d'Z6d$dd(d)d*Z7d+d, Z8d-d. Z9d#d$d$d$d/d0d1d2Z:d$d$d$d3d4d5Z;d6d7 Z<d8d9 Z=d:Z>d;Z?e>e>e?d<Z@d=ZAd>ZBeAeAeBd<ZCd?ZDd@dA ZEdBdC ZFe
dDdEdFgeG eG gdGZHdHdI ZIdJdK ZJdLdM ZKdNdO ZLdPdQ ZMG dRdS dSZNdS )U    )annotationsN)
namedtuple)Path)AnyTuple   )add_external_libscompile_ptx_to_cubinget_shared_memory_sizeirtranslate_llvmir_to_hsacotranslate_llvmir_to_ptxtranslate_triton_gpu_to_llvmir)get_backendpath_to_ptxas)OutOfResources)get_cache_manager)driver)JITFunctionget_cuda_streamget_current_deviceget_device_capabilityversion_key)extract   )ast_to_ttir)	make_stubc                 C  s*   t | j}|  |  ||  | S N)r   pass_managercontextenable_debugadd_inliner_passrun)modpm r%   W/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/triton/compiler/compiler.pyinline_triton_ir   s
    
r'   c                 C  s4   t | j}|  t|r&|| ||  | S r   )r   r   r   r    _is_cudaZadd_rewrite_tensor_pointer_passr"   r#   archr$   r%   r%   r&   ttir_compute_capability_rewrite'   s    

r+   c                 C  sl   t | } t| |} t| j}|  |  |  |  |	  |
  |  |  ||  | S r   )r'   r+   r   r   r   r    r!   Zadd_triton_combine_passZadd_canonicalizer_passZadd_reorder_broadcast_passadd_cse_passZadd_licm_passadd_symbol_dce_passr"   r)   r%   r%   r&   optimize_ttir2   s    

r.   c                 C  s,   t | j}|  || ||  | S r   )r   r   r   r    Z$add_convert_triton_to_tritongpu_passr"   )r#   	num_warpsr$   r%   r%   r&   ttir_to_ttgirB   s
    

r0   c                 C  s   t | j}|  |  |  t|tr8|| |  |	  |
| |  |	  |  |  |  |  |  ||  | S r   )r   r   r   r    Zadd_tritongpu_coalesce_passZ,add_tritongpu_remove_layout_conversions_pass
isinstanceintZ$add_tritongpu_accelerate_matmul_passZ(add_tritongpu_optimize_dot_operands_passZadd_tritongpu_pipeline_passZadd_tritongpu_prefetch_passZ(add_tritongpu_decompose_conversions_passZ'add_tritongpu_reorder_instructions_passr,   r-   r"   )r#   
num_stagesr*   r$   r%   r%   r&   optimize_ttgirJ   s$    



r4   c                 C  sP   |  D ]&\}}t|dks(t|dkr d S qt| t| t|  d S )Nr   )itemslenr   listkeysvalues)r#   Zlibsnamepathr%   r%   r&   _add_external_libs_   s    r<   c                 C  s2   |rt | | t|r"t| |dS t| ddS d S )NFr   T)r<   r(   r   )r#   extern_libsr*   r%   r%   r&   ttgir_to_llirf   s
    
r>   r2   )returnc                 C  s^   t | tsttt| d\}}|dkr2d| S |dkrBd| S |dkrRd| S tdd	S )
zK
    Get the highest PTX version supported by the current CUDA driver.
    .   P      F   
   ?   z'Triton only support CUDA 10.0 or higherN)r1   strAssertionErrormapr2   splitRuntimeError)cuda_versionmajorminorr%   r%   r&   ptx_get_versionr   s    rO   r   rG   )r#   r*   ptx_versionr?   c                 C  s&   |dkrt  \}}t|}t| ||S )zr
    Translate TritonGPU module to PTX code.
    :param mod: a TritonGPU dialect module
    :return: PTX code
    N)r   rO   r   )r#   r*   rP   _rL   r%   r%   r&   llir_to_ptx   s    
rR   )ptxr*   c                 C  s   t  \}}t| ||S )z
    Compile TritonGPU module to cubin.
    :param ptx: ptx code
    :param compute_capability: compute capability
    :return: str
    )r   r	   )rS   r*   ZptxasrQ   r%   r%   r&   ptx_to_cubin   s    
rT   c                 C  s   ddddddddg}| d	 }t d
|d	 }d| d }tjttj	
 d}i }d	}|D ]0}|| }	tj|	rb|	|dt| < |d	7 }qb|| }
tj|
r|
|dt| < |S )Nz	opencl.bczocml.bczockl.bczoclc_finite_only_off.bczoclc_daz_opt_off.bcz!oclc_correctly_rounded_sqrt_on.bczoclc_unsafe_math_off.bczoclc_wavefrontsize64_on.bcr   zgfx(\w+)Zoclc_isa_version_z.bczthird_party/rocm/lib/bitcode/Zlibrary_)researchgroupstriposr;   joinr   __file__parentresolveexistsrG   )r*   Z#gpu_arch_agnostic_bitcode_librariesgfx_archZgfx_arch_idZ!gpu_arch_specific_bitcode_libraryZbitcode_path_dirZamdgcn_bitcode_pathsiZbc_libZbc_pathZbc_gfx_pathr%   r%   r&   get_amdgcn_bitcode_paths   s0    	
ra   c                  C  s   zt jddd} t| d  }td|d 	d}|d }|d 	d	}|d }d
}t
|dkrdtd|d d d td|d d }|||gW S  tk
r   Y dS X dS )z
    get the amdgpu fulll ISA details for compiling:
    i.e., arch_triple: amdgcn-amd-amdhsa; arch_name: gfx906; arch_features: sramecc+:xnack-
    Z	ROCM_PATHz	/opt/rocm)defaultz/bin/rocminfozamd.*r   z--r   :    +z\w+z,-r   N)rY   getenv
subprocesscheck_outputdecoderU   rV   rW   rX   rJ   r6   BaseException)Zrocm_path_dirZrocminfoZgfx_arch_detailsZarch_tripleZarch_name_featuresZ	arch_nameZarch_featuresr%   r%   r&   get_amdgpu_arch_fulldetails   s    rl   zTuple[str, str])r#   r_   
gfx_triplegfx_featuresr?   c                 C  s   t | |||S )z
    Translate TritonGPU module to HSACO code based on full details of gpu architecture.
    :param mod: a TritonGPU dialect module
    :return:
        - AMDGCN code
        - Path to HSACO object
    )r   )r#   r_   rm   rn   r%   r%   r&   llir_to_amdgcn_and_hsaco   s    ro   )srcpatternr?   c                 C  s>   | st | dD ]&}| }||r| d   S qdS )zd
    Get kernel name from PTX code.
    This Kernel name is required when launching the kernel.
    
N)rH   rJ   rX   
startswith)rp   rq   liner%   r%   r&   get_kernel_name   s
    
rv   c                 C  s*   t d| }|d k	r&dt|d S | S )Nz!tt\.ptr<(.*)>*r   )rU   rV   convert_type_reprrW   )xmatchr%   r%   r&   rx      s    rx   c                   s   t | tr|d }|d }|dt }|dd}|dd}|dd	}d
d   fdd|D }	| j dd|  d|	 d| d| d| d| d| }
t|
	d
 S t | tsttt|  t  	d
 S )Nconfigs	signature	constantsr/      r3   re   debugFc                 S  s   t | jt | jfS r   )sorteddivisible_by_16
equal_to_1)confr%   r%   r&   <lambda>       zmake_hash.<locals>.<lambda>c                   s   g | ]} |qS r%   r%   ).0r   Zget_conf_keyr%   r&   
<listcomp>   s     zmake_hash.<locals>.<listcomp>-rd   zutf-8)r1   r   getdict	cache_keyrZ   r9   hashlibmd5encode	hexdigestrG   rH   r   	read_textr   )fnr*   kwargsr{   r|   r}   r/   r3   r   Zconfigs_keykeyr%   r   r&   	make_hash   s    
>r   z`^\s*tt\.func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$z=\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\))ttirttgirrS   z-%\w+: ([^,^\)\s]+)(?: \{\S+ = \S+ : \S+\})?,?z\.param\s+\.(\w+)z&"triton_gpu.num-warps"\s?=\s?(\d+)\s?:c                 C  s2   dd }i }| D ]}|| | r| | ||< q|S )Nc              	   S  s0   zt |  W dS  ttfk
r*   Y dS X d S )NTF)jsondumps	TypeErrorOverflowError)ry   r%   r%   r&   _is_jsonable  s
    
z-_get_jsonable_constants.<locals>._is_jsonabler%   )r}   r   Zserialized_constantsZconstantr%   r%   r&   _get_jsonable_constants  s    r   c                 C  s   t | |}||_|S r   )r   parse_mlir_moduler   )r;   r   moduler%   r%   r&   r   +  s    r   instance_descriptorr   r   )defaultsc                 C  s
   t | tS r   )r1   r2   r*   r%   r%   r&   r(   6  s    r(   c                 C  sl   zdd l }W n tk
r(   tdY nX | d krh|jjd krbt }t|} | d d | d  } nt } | S )Nr   z'Triton requires PyTorch to be installedrE   r   )torchImportErrorversionhipr   r   rl   )Z
capabilityr   devicer%   r%   r&   get_architecture_descriptor:  s    r   c                   s   | t|  t|D ]&}|| dks2|| d kr|| q| tjdd   d krdtddd  fddf|d< d S )	Nrd   ZMI_GPU_ARCHr   z gfx_arch is None (not specified)c                 S  s   t |  S r   r   r   r;   r%   r%   r&   r   T  r   z!add_rocm_stages.<locals>.<lambda>c                   s   t |  d d S )Nr   r   )ro   rp   r_   Zgfx_arch_full_detailsr%   r&   r   U  s   amdgcn)updatera   r7   poprY   environr   rK   )r*   r=   stagesr   r%   r   r&   add_rocm_stagesI  s    r   c                   s4   dd  fddf|d< dd  fddf|d< d S )Nc                 S  s   t |  S r   r   r   r%   r%   r&   r   \  r   z!add_cuda_stages.<locals>.<lambda>c                   s
   t |  S r   )rR   r   r   r%   r&   r   ]  r   rS   c                 S  s   t |  S r   )r   
read_bytesr   r%   r%   r&   r   ^  r   c                   s
   t |  S r   )rT   r   r   r%   r&   r   _  r   cubinr%   )r*   r=   r   r%   r   r&   add_cuda_stagesZ  s    

r   c           "   	     s*  | dd}t|}|dkr.t| dd  nt|}|s>t|jf | |dkoXt }|dkof| }t | dt | dd| d|r d	krd
nd| dt d krt | ddt }fddd f|d< fdd 	fddf|d< fdd fddf|d< dd  fddf|d< |rZt | n"|rnt	 | n|
 | ttr| dd |d 	d krt gtdkst|d< j}d}t	trdd  t	d!D 		|d< nttsttjd"\}	}
t }dd l}|t|
 ||j}|d|d }	|t|
 	}|
dkr|t|}t|dkstd#d|kst |d kstd$t |d d%d& |D }d'd  t|D 	t!|" #|
}|s|rt$|	}n|%|	}t&t' f|}ttrXjd }}ntjd"\}}d }| d(}|(|pi }| |}|d k	rt)|}t*+|}W 5 Q R X n<t, d)}|d*krd+|kstd,|d+ |d+< ||d< t!|" #|}t }}t!|- |d  D ]\}
\}}| d"|
 }|
|krd|}n| |}|d kr||}td-kr| d.} |.|d |||< |.|d | || < n|.||||< |.|| nJ|
d-kr| d.} | | }!|!d k	std/||||!f}n||}|
d0kr:|||
< n(|
d-krVt|d ||
< nt|||
< |
dkrd+|krt/||d+< |
d*krt0|d1d2|d3< |
d-krt0|d d4d2|d3< |d |d5< |s|s|1|
|||| |}q4|d kr|j.t*2||dd6||< |3|| t4|||S )7Ndevice_typecudar   r   ccr}   r/   r~   r3   K   re   r   r=   r   Fc                   s    S r   r%   r   )r   r%   r&   r   {  r   zcompile.<locals>.<lambda>astc                   s
   t |  S r   r   r   r   r%   r&   r   |  r   c              	     s   t t| d  d S )Nr   )r   r*   )r.   r   r   )r*   r{   r}   r   r|   r%   r&   r   }  r   r   c                   s
   t |  S r   r   r   r   r%   r&   r   ~  r   c                   s   t t|  S r   )r4   r0   r   )r*   r3   r/   r%   r&   r     r   r   c                 S  s   t |  S r   r   r   r%   r%   r&   r     r   c                   s   t |  S r   )r>   r   )r*   r=   r%   r&   r     r   Zllirr{   r|   r   r   c                 S  s   i | ]\}}||  qS r%   )rX   r   kvr%   r%   r&   
<dictcomp>  s      zcompile.<locals>.<dictcomp>,r@   z(Expected exactly one match for num_warpsz6num_warps in ttgir does not match num_warps in compilec                 S  s   g | ]}t |qS r%   )rx   )r   tyr%   r%   r&   r     s     zcompile.<locals>.<listcomp>c                 S  s   i | ]\}}||qS r%   r%   r   r%   r%   r&   r     s      z.json)r/   r3   r}   r   r*   rS   sharedz/ptx compilation must provide shared memory sizer   z.hsaco_pathz?Expected to have hsaco_path in metadata when we have the amdgcnr   z	// .globl)rq   r:   z.globl
hsaco_path)binary)5r   r   r   rH   r(   r   r   r   r   r   Z
add_stagesr1   r   r   r6   __name__rG   	enumeraterJ   rY   r;   basenamer   r   rU   rV   prototype_pattern	MULTILINErW   findallarg_type_patternttgir_num_warps_patternr2   r7   r8   indexr   Zmake_launcher_stubr   r   Z	get_groupopenr   loadr   r5   putr
   rv   Zadd_meta_infor   Z	put_groupCompiledKernel)"r   r   r   Z_device_backendZis_cudaZis_hipr   r:   Zfirst_stagerQ   Zir_namerp   rU   rz   typesZnum_warps_matchesZ	param_tysso_pathZfn_cache_managerextmetadataZmetadata_filenameZmetadata_groupZmetadata_pathfasmr   parseZcompile_kernelZir_filenameZnext_moduler;   Zextra_file_nameZ
hasco_pathr%   )
r*   r{   r}   r   r   r=   r   r3   r/   r|   r&   compileb  s   



$




"














r   c                      sF   e Zd ZdZdZdd Zdd Z fddZdd	 Zdd
dZ	  Z
S )r   Nc                 C  s   dd l }|jd|}|j|}|| _|j| t|d| _d|krP|d nd| _	|d | _
|d | _|d | _|d | _| jd	krt| jnd | _|| _|| _d | _d | _d S )
Nr   Z__triton_launcherZlaunchr   r/   r3   r}   r   r   )importlib.utilutilspec_from_file_locationmodule_from_specr   loaderexec_modulegetattr	c_wrapperr   r/   r3   r}   r   r   device_backendr   r   	cu_modulecu_function)selfr   r   r   r   	importlibspecr#   r%   r%   r&   __init__  s     



zCompiledKernel.__init__c           	      C  s   | j d k	rd S | jdkrNt }tjdtjditj }tj|d }tjj	}n8| j
sXt| j
 }| j
 }| j
|d }| j
 }| j|krt| j|d|| jd | j| | j|\}}}}|| _|| _|| _ || _d S )Nr   r   r   Zmax_shared_memzshared memoryr:   )r   r   r   r   ZHIPCUDAbackendutilsZget_device_propertiesZload_binaryr   rH   Zget_kernel_binZget_load_binary_fnr   r   r   r   n_spillsn_regsr   )	r   r   Zbin_pathZ
max_sharedZfn_load_binaryr#   funcr   r   r%   r%   r&   _init_handles$  s2    

  





$zCompiledKernel._init_handlesc                   s   |dkr|    t |S )Nr   )r   super__getattribute__)r   r:   	__class__r%   r&   r   A  s    zCompiledKernel.__getattribute__c                   s       d d fdd
}|S )N)streamc                   sf   | d kr*j dkrt } ntj d } j d  d  d jj| jtj	tj
f
|  d S )N)r   Zrocmr   r   r   )r   r   r   
get_streamr   r/   r   r   r   launch_enter_hooklaunch_exit_hook)r   argsgridr   r%   r&   runnerI  s    
$  z*CompiledKernel.__getitem__.<locals>.runner)r   )r   r   r   r%   r   r&   __getitem__F  s    zCompiledKernel.__getitem__c              	   C  sv   d| j kr| j d S t \}}z6t|d}|| j d  W 5 Q R X t||| _W 5 t| X | j| j d< | jS )Nsasswbr   )	r   tempfilemkstemprY   remover   writer   r   )r   Zfunfdr;   r   r%   r%   r&   get_sassS  s    

zCompiledKernel.get_sass)N)r   
__module____qualname__r   r   r   r   r   r   r  __classcell__r%   r%   r   r&   r     s   r   )N)O
__future__r   	functoolsr   r   rY   rU   rh   r  collectionsr   pathlibr   typingr   r   Z_C.libtriton.tritonr   r	   r
   r   r   r   r   Zcommon.backendr   r   Zruntime.autotunerr   Zruntime.cacher   Zruntime.driverr   Zruntime.jitr   r   r   r   r   Ztools.disasmr   Zcode_generatorr   Zmake_launcherr   r'   r+   r.   r0   r4   r<   r>   	lru_cacherO   rR   rT   ra   rl   ro   rv   rx   r   Zmlir_prototype_patternZptx_prototype_patternr   Zmlir_arg_type_patternZptx_arg_type_patternr   r   r   r   setr   r(   r   r   r   r   r   r%   r%   r%   r&   <module>   sv   $ %