"""muji.py does multi-gpu training for caffe2 with no need to change the c++
side code. Everything is defined on the computation graph level.

We support the following use cases:
  - 2 gpus, where peer access is enabled between them.
  - 4 gpus, where peer access is enabled between all of them.
  - 4 gpus, where peer access is enabled in two groups,
    between {1, 2} and {3, 4}.
  - 8 gpus, where peer access is enabled in two groups,
    between {1, 2, 3, 4} and {5, 6, 7, 8}.
If the above cases are not satisfied, a fallback function which does not rely on
peer access will be called.
"""

import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import workspace


def OnGPU(gpu_id):
    """A utility function that returns a device option protobuf of the
    specified gpu id.
    """
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = workspace.GpuDeviceType
    device_option.device_id = gpu_id
    return device_option


def OnCPU():
    """A utility function that returns a CPU device option."""
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CPU
    return device_option
| |||S t|dkr0|jd dkr0t|ddddf r0t|ddddf r0t| |||S t|dkrx|jd dkrxt|ddddf rxt| |||S t| |||S dS )zThe general Allreduce interface that reroutes the function calls.
    CPUs and AMD GPUs are not supported because
    GetGpuPeerAccessPattern is called to get gpu peer access pattern.
    """
    if gpu_indices is None:
        gpu_indices = list(range(len(blobs)))
    if len(gpu_indices) != len(blobs):
        raise RuntimeError(
            "gpu_indices length and blobs length mismatch: %d vs %d" %
            (len(gpu_indices), len(blobs))
        )
    pattern = workspace.GetGpuPeerAccessPattern()
    if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
        return Allreduce2(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
        return Allreduce4(net, blobs, reduced_affix, gpu_indices)
    elif (len(blobs) == 4 and pattern.shape[0] >= 4 and
          np.all(pattern[:2, :2]) and np.all(pattern[2:4, 2:4])):
        return Allreduce4Group2(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 8 and pattern.shape[0] >= 8 and np.all(pattern[:8, :8]):
        return Allreduce8(net, blobs, reduced_affix, gpu_indices)
    else:
        return AllreduceFallback(net, blobs, reduced_affix, gpu_indices)
      C   sL   |\}}|\}}| j ||g|| t|d}|jg || t|d}	||	fS )zUAllreduce for 2 gpus.

  Algorithm: 0r <- 0 + 1, 1r <- 0r, where r means "reduced"
  r
   )Addr	   Copy)
r   r   r   r   abgpu_agpu_b	a_reduced	b_reducedr   r   r   r   @   s    r   c                 C   s   |\}}}}|\}}	}
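

# For illustration, not part of the original module: for blobs ("g0", "g1")
# on gpus (0, 1), Allreduce2 above emits roughly
#
#   Add(["g0", "g1"]) -> "g0_reduced"     # runs on gpu 0, reads g1 via p2p
#   Copy(["g0_reduced"]) -> "g1_reduced"  # runs on gpu 1, reads via p2p
#
# which is why it requires peer access between the two devices.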
}| j ||gt|| t|d}| j ||gt|| t|
d}|j ||t|d}|jg |t|
d}|jg t|| t|	d}|jg t|| t|d}||||fS )zAllreduce for 4 gpus.

  Algorithm: 2 level reduction.
      0r <- 0 + 1, 2r <- 2 + 3
      0r <- 0r + 2r
      2r <- 0r,
      1r <- 0r, 3r <- 2r
  r
   r"   strr	   r#   )r   r   r   r   r$   r%   cdr&   r'   gpu_cgpu_dr(   	c_reducedr)   	d_reducedr   r   r   r   P   s2    	



r   c                 C   s   |\}}}}|\}}	}
}| j ||gt|| t|d}| j ||gt|| t|
d}|jg t|d t|d}|j ||t|d}|jg |t|
d}|jg t|| t|	d}|jg t|| t|d}||||fS )zAllreduce for 4 gpus where peer access are enabled in {0,1} and {2,3}

  Algorithm: 2 level reduction.
      0r <- 0 + 1, 2r <- 2 + 3
      0r <- 0r + 2r
      2r <- 0r,
      1r <- 0r, 3r <- 2r
  r
   _copyr*   )r   r   r   r   r$   r%   r,   r-   r&   r'   r.   r/   r(   r0   Zc_reduced_copyr)   r1   r   r   r   r   x   s<    	




r   c                 C   sh  dgd }dD ]8}| j || ||d  g|| | t|| d||< qdD ]<}| j || ||d  gt|| | t|| d||< qL|d jg t|d d	 t|d
 d}|d
 j ||d
 t|d
 d|d
< |d
 jg |d t|d d|d< dD ],}||d  jg || t|| d||< qdD ]2}||d  jg || | t|| d||< q0|S )zAllreduce for 8 gpus.

  Algorithm: 3 level reduction.
      0r <- 0 + 1, 2r <- 2 + 3, 4r <- 4 + 5, 6r <- 6 + 7
      0r <- 0r + 2r, 4r <- 4r + 6r
      0r <- 0r + 4r
      4r <- 0r
      2r <- 0r, 6r <- 4r
      1r <- 0r, 3r <- 2r, 5r <- 4r, 7r <- 6r
    """
    reduced = [None] * 8
    # Reduction level 1: sums within pairs.
    for i in [0, 2, 4, 6]:
        reduced[i] = net.Add(
            [blobs[i], blobs[i + 1]],
            blobs[i] + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    # Reduction level 2: sums within groups of 4.
    for i in [0, 4]:
        reduced[i] = net.Add(
            [reduced[i], reduced[i + 2]],
            str(blobs[i]) + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    # Reduction level 3: this involves an explicit copy, since the two groups
    # of 4 have no peer access to each other.
    reduced_4_copy = reduced[4].Copy(
        [],
        str(reduced[4]) + '_copy',
        device_option=OnGPU(gpu_indices[0])
    )
    reduced[0] = reduced[0].Add(
        reduced_4_copy,
        reduced[0],
        device_option=OnGPU(gpu_indices[0])
    )
    # Broadcast level 1: copy the full sum back to the second group.
    reduced[4] = reduced[0].Copy(
        [],
        reduced[4],
        device_option=OnGPU(gpu_indices[4])
    )
    # Broadcast level 2: fan out within groups of 4.
    for i in [2, 6]:
        reduced[i] = reduced[i - 2].Copy(
            [],
            reduced[i],
            device_option=OnGPU(gpu_indices[i])
        )
    # Broadcast level 3: fan out within pairs.
    for i in [1, 3, 5, 7]:
        reduced[i] = reduced[i - 1].Copy(
            [],
            blobs[i] + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    return reduced
r   c                 C   s   dgt | }|dkr@| j|d |d | t|d d|d< n|d |d< |d d }tdt |D ]H}| j|| |t|d d}| j||d g|d t|d d|d< qftdt |D ],}| j|d || | t|| d||< q|S )zA fallback option for Allreduce with no assumption on p2p.

    Algorithm: a flat operation on gpu 0
        0r <- 0
        0r <- 0r + i for i in gpu_indices[1:]
        ir <- 0r for i in gpu_indices[1:]
    """
    reduced = [None] * len(blobs)
    # Copy the first blob to its reduced name, or reuse it if the affix is
    # empty.
    if reduced_affix != '':
        reduced[0] = net.Copy(
            blobs[0],
            blobs[0] + reduced_affix,
            device_option=OnGPU(gpu_indices[0])
        )
    else:
        reduced[0] = blobs[0]
    # Copy each remaining blob to gpu 0 through a temporary, and accumulate.
    temp_name = reduced[0] + '_temp_copy'
    for i in range(1, len(blobs)):
        temp = net.Copy(
            blobs[i],
            temp_name,
            device_option=OnGPU(gpu_indices[0])
        )
        reduced[0] = net.Add(
            [temp, reduced[0]],
            reduced[0],
            device_option=OnGPU(gpu_indices[0])
        )
    # Broadcast the result to everyone else.
    for i in range(1, len(blobs)):
        reduced[i] = net.Copy(
            reduced[0],
            blobs[i] + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    return reduced

(.<