U
    -e                     @   s   d dl Z ddlmZ ddlmZ ddlmZ dd Zeejejejdd	d
ZeejejejdddZ	G dd de j
jZG dd dZdS )    N   )jit)language)next_power_of_2c                 C   s4   | dkrdS | dkrdS | dkr$dS | dkr0dS d	S )
N            i      i          )nr   r   _/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/triton/ops/blocksparse/softmax.py	num_warps   s    r   )ROW_SIZE
BLOCK_SIZEIS_DENSEc                  C   s  t d}t d}t d}|t d | }t d|
| }t d|
| }||| d  }t |d }t |d }|| }||| | | 7 }||| | 7 }|rt d|
}nH|dt d t d |  }t j|| | ||k dd}|| | }||k }t j|| | |td d}|t j}|}||9 }|d k	r||| 7 }||| 7 }|| d | }|dk||k @ }t j|||  | |dd}||7 }|t j}t ||k|	@ td |}t 	|}t j
| | | ||d d S )Nr   r   r	   maskotherinf        r   )tl
program_idnum_programsarangeloadfloattofloat32wheresoftmaxstore) OutAZ	stride_xzLUTRextent	stride_zr	stride_hrscale	is_causalr   r   r   hmzhmlane_nblock_nheadersizeoffsetZoff_ansoff_lutstart_nr   aoutoff_lomask_lo
rel_logitsr   r   r   _blocksparse_softmax_fwd   sB    	


 

r?   c           '      C   s"  t d}t d}t d}|t d | }t d|| }t d|| }||| d  }t |d }t |d }|| | | }||| | 7 }||k }|||  | }|||  | }|rt d|}nD|dt d t d |  }t j|| | |dd} | | | }t j|| |dd}!|!t j}!t j|| |dd}"|"t j}"t ||k|@ |!|!k@ d|!}!|!|"t |!|" d  }#|d k	r|||
 7 }||| 7 }|	| d | }$|$dk|$|	k @ |@ }%t j	|||	  |$ |#|%d |#| }#| ||  | }&t j	|&| |#|d d S )Nr   r   r	   r   r   r   )
r   r   r   r   r   r    r!   r"   sumr$   )'ZDAZ
stride_zdxZDOutZstride_zdoutr%   Zstride_zoutr,   r'   ZDRr)   r*   r+   Z	stride_err-   r   r   r   r.   r/   r0   r1   r2   r3   r4   r5   r6   Zoff_mnr   ZAsZDOutsr7   r8   r9   r:   doutdar<   r=   ZDAsr   r   r   _blocksparse_softmax_bwdK   sD    


 
rC   c                   @   s0   e Zd Zedd Zedd Zedd ZdS )_softmaxc              	   C   s   t jg t j| jd}| }t| jd D ]*}t || |d d d d f df}q*|| }t 	|}t j
|d d dd|dd < | jddd d df }t j||fddd}	t |	|ft j|}
|
t| fS )	Ndtypedevicer   )dimr   F)as_tupler	   )torchZtensorZint64rG   clonerangeshapecatr@   Z
zeros_likeZcumsumZnonzerostackviewtypeZint32r    intmax)layoutblockrG   _emptysizesr.   Ztotal_sizesoffsetscolumnsr4   lutr   r   r   make_lut   s    (
z_softmax.make_lutc
                 C   s  |d k	r,t |tjr,|jjdks$t| }|jd }
|d |d | |
g}|d krXdn|j}|d krjdn| }t	|}t
| |||d|||d |d |d |||t||	t|d | || || _|| _|| _|| _|| _|| _|j| _|	| _|| _|S )Ncpur   r   )r   r   r   r   rH   r   r   r   r   )
isinstancerK   ZTensorrG   rR   AssertionErroritemrN   stride
empty_liker?   r   r   Zsave_for_backwardspdimsrV   maxlutr,   	rel_shaperel_stridesrF   	rel_dtypeis_denser-   )ctxr:   r,   r>   r-   rd   rV   r[   re   ri   Mgridrf   rg   r;   r   r   r   forward   sF    

      z_softmax.forwardc                 C   s   | j \}}d }| jd r.tj| j| j|jd}|jd }| jd | jd | j	 |f}t
|}t| ||d||d||d| j||| jd | jd | jd | jd | j| j	t| j| jt| jd |d d |d d d d d d d d d d d d d d fS )Nr   rE   r   r   rH   r	   r^   )Zsaved_tensorsZneeds_input_gradrK   Zzerosrf   rh   rG   rN   rd   rV   rc   rC   rb   r,   rg   r-   r   re   ri   r   )rj   rA   r;   r[   Zdrrk   rl   rB   r   r   r   backward   sT    



       
        z_softmax.backwardN)__name__
__module____qualname__staticmethodr\   rm   rn   r   r   r   r   rD      s   

%rD   c                   @   s(   e Zd Zd	ddZddddddZdS )
r#   Fc                 C   s8   |j | _|| _|| _t| j| j|\| _| _|| _d S )N)	rN   rd   rU   rV   rD   r\   r[   re   ri   )selfrU   rV   rG   ri   r   r   r   __init__   s
    zsoftmax.__init__g      ?N)r,   r>   r-   c                C   sL   |d k	r$|j |j kr$td|j  t||||| j| j| j| j| j	}|S )Nz$relative position embedding must be )	rF   
ValueErrorrD   applyrd   rV   r[   re   ri   )rs   r:   r,   r>   r-   r   r   r   __call__   s           zsoftmax.__call__)F)ro   rp   rq   rt   rw   r   r   r   r   r#      s   
r#   )rK    r   r   r   r   r   Z	constexprr?   rC   ZautogradFunctionrD   r#   r   r   r   r   <module>   s    6	;Z