from numba import cuda
from numba.cuda.cudadrv.driver import driver
import math
from numba.np import numpy_support as nps


def transpose(a, b=None):
    """Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    """

    # prefer `a`'s stream if it has one (i.e. it is already a device array)
    stream = getattr(a, 'stream', 0)

    if not b:
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols),
            strides,
            dtype=a.dtype,
            stream=stream)

    dt = nps.from_dtype(a.dtype)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # factor the available threads per block into x and y axes for the tile
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    # pad the tile's second dimension by one element to avoid shared-memory
    # bank conflicts, per the algorithm documented in the blog post above
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        x = by + tx
        y = bx + ty

        # stage a tile of the input into shared memory, then write it back
        # out transposed
        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
    # one thread per element in the tile
    threads = tile_height, tile_width

    kernel[blocks, threads, stream](a, b)

    return b