U
    d)                  	   @   s   d dl mZ d dlmZmZmZmZmZ d dlmZ d dl	Z	d dl
Zd dlZd dlZe  edZeej dd Zdd	 Zd
d Zejdd Zdd Zedkre  \ZZejrdnd Zedddd edge  e!ej"rej#nej$dZ%e&e% ee W 5 Q R X dS )    )
caffe2_pb2)	workspacecoreutilsrnn_cellmodel_helper)	recurrentNZ
lstm_benchc                 C   sz  t d|  td}|jg dd| d}|jg dd| d}t| td}||dgdg ||d	gd	g t	j
d
 g }t| D ]}	|	td| d  dkrtd|	|  |	dkr|st	j
d|d g|dd  n|}
t	j
j|
 t	j}|
d }|| }t	j
|
d | t	j}td| td	| t|  ||
d |
d   qt d |||fS )z&
    Fill a queue with input data
    z Generating T={} sequence batchesZgenerate_input_initZ
inputqueue   )Z	num_blobscapacityZ
labelqueueZgenerate_inputZscratchZ	label_scri+
  
   r   zGenerating data {}/{}NzFinished data generation)loginfoformatr   ZNetZCreateBlobsQueuer   
RunNetOnceZEnqueueBlobsnprandomseedrangemaxprintrandintZrandZastypefloat32int32FeedBlobProtoappend)TshapeZ
num_labelsfixed_shapeZgenerate_input_init_netqueuelabel_queueZgenerate_input_netentry_countstZrandom_shapeX
batch_sizeLlabels r'   @/tmp/pip-unpacked-wheel-ua33x9lu/caffe2/python/lstm_benchmark.pygenerate_data   sN    
      

"
r)   c                 C   s^  t jdd}|jdd\}}|j|d}|j|d}g }	| jdkrd }
d| jkrz| jsftd	| j}
t	d

|
 t| jD ]0}|jd
|d
|\}}|	||g qtj||||	| j| jg| j d| j| jdd|
d\}}}}d| jkr~t	d d|j _d|j _n`| jdkrp|jdd}	|jjg ||d tj|||	| j| jd| jd\}}}nds~td|j|d}|j||||gddg\}}| js||g |	D ]P}|j|| | j}| jdkr|| j9 }t |t!j"d| j#|gt!j$d q| j%rV|j j&D ](}|j'd r,t(j)|| j*| j+d! q,||fS )"NZ
LSTM_bench)nameseq_lengthstargetZ
input_datalabel)ownstaticZ
static_dagr/   z0Random input length is not static RNN compatiblezUsing static RNN of size {}zhidden_init_{}zcell_init_{}Zlstm1T)model
input_blobr+   initial_statesdim_indim_outscopememory_optimizationforward_onlyZdrop_statesZreturn_last_layer_onlyZstatic_rnn_unroll_sizeZdagzUsing DAG net type   Zcudnnhidden_init	cell_init)r   Z	cudnnlstm)r0   r1   r2   r3   r4   r5   
num_layersFzUnknown implementationweightssoftmaxlossr	   ZdtypeZRecurrentNetwork)Znum_threadsZmax_cuda_streams),r   ZModelHelpernetZAddExternalInputsZDequeueBlobsimplementationr   AssertionError
seq_lengthr   r   r   r;   extendr   ZLSTM	input_dim
hidden_dimr6   r7   r   typeZnum_workersparam_init_netZConstantFillZ
cudnn_LSTMZUniformFillZSoftmaxWithLossZFlattenZAddGradientOperatorsZCopyr   r   r   zerosr$   r   rnn_executorop
startswithr   Zset_rnn_executor_configZrnn_executor_num_threadsZrnn_executor_max_cuda_streams)argsr   r    Zinput_shaper0   r+   r,   r1   r&   Z
init_blobsr   ir9   r:   outputZlast_hidden_Z
last_stater<   r=   r>   Z	init_blobszrK   r'   r'   r(   create_modelC   s    



 
rR   c                 C   sB  | j | j }| j| j| jg}t|| j || j| j\}}}tdt	j
| jg| j t	jd t| |||\}}t|j t|j t }|| j }	d}
td t|j j | jrtd t }td|d d d  td	 t }t }td
|	| jD ]t}t| j|	| }|
|7 }
t|j j| t }td||	t	||||  ||  d d  |}qtdt	|d
d  t |  d d | jrdnd | jr6td t }td|d d d  |d |d kr6t d|d |d  t d t | S )Nr+   r?   r   z------ Warming up ------zMemory stats:zGPU memory:	{} MBZ	max_totali   z ------ Starting benchmark ------r	   z'Iter: {} / {}. Entries Per Second: {}k.d   r   z/Done. Total EPS excluding 1st iteration: {}k {}z (with RNN executor) totalz3Max usage differs from current total usage: {} > {}z.This means that costly deallocations occurred.)!	data_sizer$   rC   rE   r)   rF   r   r   r   r   arrayr   rR   r   rH   Z	CreateNetr@   timer   r   ZRunNetr   r*   gpur   ZGetGPUMemoryUsageStatsr   r   Ziters_to_reportminsumrJ   warning)rM   r   Zinput_blob_shaper   r    r!   r0   rO   
start_timeZ	num_itersZtotal_itersstatsZ	last_time	iterationZ
iters_onceZnew_timer'   r'   r(   
Caffe2LSTM   s|    





$
 
r`   c                 C   s   t | S )N)r`   )rM   r'   r'   r(   	Benchmark   s    ra   c                  C   s  t jdd} | jdtddd | jdtdd	d | jd
tddd | jdtddd | jdtddd | jdtddd | jdddd | jdtddd | jdddd | jdddd | jd dd!d | jd"td#d$d | jd%dd&d | jd'td d(d | jd)td d*d | S )+NzLSTM benchmark.)descriptionz--hidden_dimi   zHidden dimension)rG   defaulthelpz--input_dim(   zInput dimensionz--batch_size   zThe batch size.z--seq_length   zMax sequence lengthz--data_sizei@B z!Number of data points to generatez--iters_to_reportz&Number of iteration to report progressz--gpu
store_truezRun all on GPU)actionrd   z--implementationr.   z('cudnn', 'own', 'static' or 'static_dag'z--fixed_shapezLWhether to randomize shape of input batches. Static RNN requires fixed shapez--memory_optimizationz+Whether to use memory optimized LSTM or notz--forward_onlyz Whether to run only forward passz--num_layersr	   zNNumber of LSTM layers. All output dimensions are going to beof hidden_dim sizez--rnn_executorzWhether to use RNN executorz--rnn_executor_num_threadsz*Number of threads used by CPU RNN Executorz--rnn_executor_max_cuda_streamsz:Maximum number of CUDA streams used by RNN executor on GPU)argparseArgumentParseradd_argumentintstr)parserr'   r'   r(   GetArgumentParser   s    rp   __main__r	   Zcaffe2z--caffe2_log_level=0z#--caffe2_print_blob_sizes_at_exit=0z--caffe2_rnn_executor={}z--caffe2_gpu_memory_tracking=1r8   )'Zcaffe2.protor   Zcaffe2.pythonr   r   r   r   r   r   rj   Znumpyr   rX   loggingbasicConfig	getLoggerr   setLevelDEBUGr)   rR   r`   debugra   rp   __name__parse_known_argsrM   
extra_argsrJ   Zrnn_executor_optZ
GlobalInitr   ZDeviceOptionrY   ZGpuDeviceTypeZCPUZdeviceZDeviceScoper'   r'   r'   r(   <module>   sB   
,`F
]
 