# caffe2/python/rnn_cell.py
# Reconstructed source view of a stripped, compiled-module dump: the imports,
# names, signatures, and docstrings below are recovered from the dump. Method
# bodies reduced to `...` could not be recovered and are left as stubs.

import functools
import inspect
import logging
import random

import numpy as np

from future.utils import viewkeys
from caffe2.proto import caffe2_pb2
from caffe2.python.attention import (
    apply_dot_attention,
    apply_recurrent_attention,
    apply_regular_attention,
    apply_soft_coverage_attention,
    AttentionType,
)
from caffe2.python import core, recurrent, workspace, brew, scope, utils
from caffe2.python.modeling.parameter_sharing import ParameterSharing
from caffe2.python.modeling.parameter_info import ParameterTags
from caffe2.python.modeling.initializers import Initializer
from caffe2.python.model_helper import ModelHelper


def _RectifyName(blob_reference_or_name):
    # Normalize a blob argument: pass None through, scope plain strings,
    # and reject anything that is not already a BlobReference.
    if blob_reference_or_name is None:
        return None
    if isinstance(blob_reference_or_name, str):
        return core.ScopedBlobReference(blob_reference_or_name)
    if not isinstance(blob_reference_or_name, core.BlobReference):
        raise Exception("Unknown blob reference type")
    return blob_reference_or_name


def _RectifyNames(blob_references_or_names):
    if blob_references_or_names is None:
        return None
    return [_RectifyName(i) for i in blob_references_or_names]


class RNNCell(object):
    """
    Base class for writing recurrent / stateful operations.

    One needs to implement 2 methods: apply_override
    and get_state_names_override.

    As a result, the base class will provide the apply_over_sequence method,
    which allows you to apply recurrent operations over a sequence of any
    length.

    Optionally, you can add input and output preparation steps by overriding
    the corresponding methods.
    """
    def __init__(self, name=None, forward_only=False, initializer=None):
        self.name = name
        self.recompute_blobs = []
        self.forward_only = forward_only
        self._initializer = initializer

    @property
    def initializer(self):
        return self._initializer

    @initializer.setter
    def initializer(self, value):
        self._initializer = value

    def scope(self, name):
        return self.name + '/' + name if self.name is not None else name

    def apply_over_sequence(
        self,
        model,
        inputs,
        seq_lengths=None,
        initial_states=None,
        outputs_with_grads=None,
    ):
        if initial_states is None:
            with scope.NameScope(self.name):
                if self.initializer is None:
                    raise Exception(
                        'Either initial states or initializer have to be set')
                initial_states = self.initializer.create_states(model)

        preprocessed_inputs = self.prepare_input(model, inputs)
        step_model = ModelHelper(name=self.name, param_model=model)
        input_t, timestep = step_model.net.AddScopedExternalInputs(
            'input_t',
            'timestep',
        )
        utils.raiseIfNotEqual(
            len(initial_states), len(self.get_state_names()),
            "Number of initial state values provided doesn't match the number "
            "of states",
        )
        states_prev = step_model.net.AddScopedExternalInputs(*[
            s + '_prev' for s in self.get_state_names()
        ])
        states = self._apply(
            model=step_model,
            input_t=input_t,
            seq_lengths=seq_lengths,
            states=states_prev,
            timestep=timestep,
        )

        external_outputs = set(step_model.net.Proto().external_output)
        for state in states:
            if state not in external_outputs:
                step_model.net.AddExternalOutput(state)

        if outputs_with_grads is None:
            outputs_with_grads = [self.get_output_state_index() * 2]

        # recurrent_net returns the per-step values of every state interleaved
        # with each state's final value:
        # (state_1_all, state_1_final, state_2_all, state_2_final, ...)
        states_for_all_steps = recurrent.recurrent_net(
            net=model.net,
            cell_net=step_model.net,
            inputs=[(input_t, preprocessed_inputs)],
            initial_cell_inputs=list(zip(states_prev, initial_states)),
            links=dict(zip(states_prev, states)),
            timestep=timestep,
            scope=self.name,
            forward_only=self.forward_only,
            outputs_with_grads=outputs_with_grads,
            recompute_blobs_on_backward=self.recompute_blobs,
        )

        output = self._prepare_output_sequence(model, states_for_all_steps)
        return output, states_for_all_steps

    def apply(self, model, input_t, seq_lengths, states, timestep):
        input_t = self.prepare_input(model, input_t)
        states = self._apply(model, input_t, seq_lengths, states, timestep)
        output = self._prepare_output(model, states)
        return output, states

    def _apply(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        """
        This method uses apply_override provided by a custom cell.
        On top of that, it takes care of applying self.scope() to all the
        outputs, while all the inputs stay within the scope this function
        was called from.
        """
        args = self._rectify_apply_inputs(
            input_t, seq_lengths, states, timestep, extra_inputs)
        with core.NameScope(self.name):
            return self.apply_override(model, *args)

    def _rectify_apply_inputs(
        self, input_t, seq_lengths, states, timestep, extra_inputs
    ):
        """
        Before applying a scope, we make sure that all external blob names
        are converted to blob references, so further scoping doesn't affect
        them.
        """
        input_t, seq_lengths, timestep = _RectifyNames(
            [input_t, seq_lengths, timestep])
        states = _RectifyNames(states)
        if extra_inputs:
            extra_input_names, extra_input_sizes = zip(*extra_inputs)
            extra_inputs = zip(
                _RectifyNames(extra_input_names), extra_input_sizes)

        arg_names = inspect.getargspec(self.apply_override).args
        rectified = [input_t, seq_lengths, states, timestep]
        if 'extra_inputs' in arg_names:
            rectified.append(extra_inputs)
        return rectified

    def apply_override(
        self, model, input_t, seq_lengths, timestep, extra_inputs=None
    ):
        """
        A single step of a recurrent network to be implemented by each custom
        RNNCell.

        model: ModelHelper object new operators would be added to

        input_t: single input with shape (1, batch_size, input_dim)

        seq_lengths: blob containing sequence lengths which would be passed to
        LSTMUnit operator

        states: previous recurrent states

        timestep: current recurrent iteration. Could be used together with
        seq_lengths in order to determine, if some shorter sequences
        in the batch have already ended.

        extra_inputs: list of tuples (input, dim). specifies additional input
        which is not subject to prepare_input(). (useful when a cell is a
        component of a larger recurrent structure, e.g., attention)
        """
        raise NotImplementedError('Abstract method')

    def prepare_input(self, model, input_blob):
        """
        If some operations in the _apply method depend only on the input,
        not on recurrent states, they can be computed in advance.

        model: ModelHelper object new operators would be added to

        input_blob: either the whole input sequence with shape
        (sequence_length, batch_size, input_dim) or a single input with shape
        (1, batch_size, input_dim).
        """
        return input_blob

    def get_output_state_index(self):
        """
        Return index into state list of the "primary" step-wise output.
        """
        return 0

    def get_state_names(self):
        """
        Returns recurrent state names with self.name scoping applied.
        """
        return [self.scope(name) for name in self.get_state_names_override()]

    def get_state_names_override(self):
        """
        Override this function in your custom cell.
        It should return the names of the recurrent states.

        It's required by the apply_over_sequence method in order to allocate
        recurrent states for all steps with meaningful names.
        """
        raise NotImplementedError('Abstract method')

    def get_output_dim(self):
        """
        Specifies the dimension (number of units) of stepwise output.
        """
        raise NotImplementedError('Abstract method')

    def _prepare_output(self, model, states):
        """
        Allows arbitrary post-processing of primary output.
        """
        return states[self.get_output_state_index()]

    def _prepare_output_sequence(self, model, state_outputs):
        """
        Allows arbitrary post-processing of primary sequence output.

        (Note that state_outputs alternates between full-sequence and final
        output for each state, thus the index multiplier 2.)
        """
        output_sequence_index = 2 * self.get_output_state_index()
        return state_outputs[output_sequence_index]


class LSTMInitializer(object):
    def __init__(self, hidden_size):
        self.hidden_size = hidden_size

    def create_states(self, model):
        return [
            model.create_param(
                param_name='initial_hidden_state',
                initializer=Initializer(
                    operator_name='ConstantFill', value=0.0),
                shape=[self.hidden_size],
            ),
            model.create_param(
                param_name='initial_cell_state',
                initializer=Initializer(
                    operator_name='ConstantFill', value=0.0),
                shape=[self.hidden_size],
            ),
        ]


class BasicRNNCell(RNNCell):
    def __init__(
        self,
        input_size,
        hidden_size,
        forget_bias,
        memory_optimization,
        drop_states=False,
        initializer=None,
        activation=None,
        **kwargs
    ):
        super(BasicRNNCell, self).__init__(**kwargs)
        self.drop_states = drop_states
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.activation = activation

        if self.activation not in ['relu', 'tanh']:
            raise RuntimeError(
                'BasicRNNCell with unknown activation function (%s)'
                % self.activation)

    def apply_override(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        hidden_t_prev = states[0]

        # One FC over the previous hidden state, summed with the (already
        # FC-transformed) input, followed by the chosen nonlinearity.
        gates_t = brew.fc(
            model,
            hidden_t_prev,
            'gates_t',
            dim_in=self.hidden_size,
            dim_out=self.hidden_size,
            axis=2,
        )
        brew.sum(model, [gates_t, input_t], gates_t)

        if self.activation == 'tanh':
            hidden_t = model.net.Tanh(gates_t, 'hidden_t')
        elif self.activation == 'relu':
            hidden_t = model.net.Relu(gates_t, 'hidden_t')
        else:
            raise RuntimeError(
                'BasicRNNCell with unknown activation function (%s)'
                % self.activation)

        if seq_lengths is not None:
            # Mask out steps past each sequence's length: keep the previous
            # hidden state for finished sequences, or zero it out when
            # drop_states is set.
            timestep = model.net.CopyFromCPUInput(timestep, 'timestep_gpu')
            valid_b = model.net.GT(
                [seq_lengths, timestep], 'valid_b', broadcast=1)
            invalid_b = model.net.LE(
                [seq_lengths, timestep], 'invalid_b', broadcast=1)
            valid = model.net.Cast(valid_b, 'valid', to='float')
            invalid = model.net.Cast(invalid_b, 'invalid', to='float')

            hidden_valid = model.net.Mul(
                [hidden_t, valid],
                'hidden_valid',
                broadcast=1,
                axis=1,
            )
            if self.drop_states:
                hidden_t = hidden_valid
            else:
                hidden_invalid = model.net.Mul(
                    [hidden_t_prev, invalid],
                    'hidden_invalid',
                    broadcast=1,
                    axis=1,
                )
                hidden_t = model.net.Add(
                    [hidden_valid, hidden_invalid], hidden_t)
        return (hidden_t,)

    def prepare_input(self, model, input_blob):
        return brew.fc(
            model,
            input_blob,
            self.scope('i2h'),
            dim_in=self.input_size,
            dim_out=self.hidden_size,
            axis=2,
        )

    def get_state_names(self):
        return (self.scope('hidden_t'),)

    def get_output_dim(self):
        return self.hidden_size


class LSTMCell(RNNCell):
    def __init__(
        self,
        input_size,
        hidden_size,
        forget_bias,
        memory_optimization,
        drop_states=False,
        initializer=None,
        **kwargs
    ):
        super(LSTMCell, self).__init__(initializer=initializer, **kwargs)
        self.initializer = initializer or LSTMInitializer(
            hidden_size=hidden_size)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.forget_bias = float(forget_bias)
        self.memory_optimization = memory_optimization
        self.drop_states = drop_states
        self.gates_size = 4 * self.hidden_size

    def apply_override(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        hidden_t_prev, cell_t_prev = states

        fc_input = hidden_t_prev
        fc_input_dim = self.hidden_size

        if extra_inputs is not None:
            extra_input_blobs, extra_input_sizes = zip(*extra_inputs)
            fc_input = brew.concat(
                model,
                [hidden_t_prev] + list(extra_input_blobs),
                'gates_concatenated_input_t',
                axis=2,
            )
            fc_input_dim += sum(extra_input_sizes)

        gates_t = brew.fc(
            model,
            fc_input,
            'gates_t',
            dim_in=fc_input_dim,
            dim_out=self.gates_size,
            axis=2,
        )
        brew.sum(model, [gates_t, input_t], gates_t)

        if seq_lengths is not None:
            inputs = [hidden_t_prev, cell_t_prev, gates_t,
                      seq_lengths, timestep]
        else:
            inputs = [hidden_t_prev, cell_t_prev, gates_t, timestep]

        hidden_t, cell_t = model.net.LSTMUnit(
            inputs,
            ['hidden_state', 'cell_state'],
            forget_bias=self.forget_bias,
            drop_states=self.drop_states,
            sequence_lengths=(seq_lengths is not None),
        )
        model.net.AddExternalOutputs(hidden_t, cell_t)
        if self.memory_optimization:
            self.recompute_blobs = [gates_t]
        return hidden_t, cell_t

    def get_input_params(self):
        return {
            'weights': self.scope('i2h') + '_w',
            'biases': self.scope('i2h') + '_b',
        }

    def get_recurrent_params(self):
        return {
            'weights': self.scope('gates_t') + '_w',
            'biases': self.scope('gates_t') + '_b',
        }

    def prepare_input(self, model, input_blob):
        return brew.fc(
            model,
            input_blob,
            self.scope('i2h'),
            dim_in=self.input_size,
            dim_out=self.gates_size,
            axis=2,
        )

    def get_state_names_override(self):
        return ['hidden_t', 'cell_t']

    def get_output_dim(self):
        return self.hidden_size


class LayerNormLSTMCell(RNNCell):
    def __init__(
        self,
        input_size,
        hidden_size,
        forget_bias,
        memory_optimization,
        drop_states=False,
        initializer=None,
        **kwargs
    ):
        super(LayerNormLSTMCell, self).__init__(
            initializer=initializer, **kwargs)
        self.initializer = initializer or LSTMInitializer(
            hidden_size=hidden_size)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.forget_bias = float(forget_bias)
        self.memory_optimization = memory_optimization
        self.drop_states = drop_states
        self.gates_size = 4 * self.hidden_size

    def _apply(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        # Same step structure as LSTMCell, but the summed gate pre-activations
        # go through brew.layer_norm (producing 'gates_t_norm') before the
        # LSTMUnit operator. (Exact operator wiring is not recoverable from
        # the stripped dump.)
        ...

    def get_input_params(self):
        return {
            'weights': self.scope('i2h') + '_w',
            'biases': self.scope('i2h') + '_b',
        }

    def prepare_input(self, model, input_blob):
        return brew.fc(
            model,
            input_blob,
            self.scope('i2h'),
            dim_in=self.input_size,
            dim_out=self.gates_size,
            axis=2,
        )

    def get_state_names(self):
        return (self.scope('hidden_t'), self.scope('cell_t'))


class MILSTMCell(LSTMCell):
    def _apply(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        # Multiplicative-integration LSTM step: per-gate parameters 'alpha',
        # 'beta1', 'beta2' (initialized to 1.0) and 'b' (initialized to 0.0)
        # combine the input projection i2h and the recurrent projection
        # 'prev_t' roughly as
        #   gates_t = alpha * i2h * prev_t + beta1 * prev_t + beta2 * i2h + b
        # built from ElementwiseLinear/Mul/Sum ops
        # ('alpha_by_input_t_plus_beta_h',
        #  'alpha_by_input_t_plus_beta_h_by_prev_t',
        #  'beta_i_by_input_t_plus_b'), then fed to LSTMUnit as in LSTMCell.
        # (Exact operator wiring is not recoverable from the stripped dump.)
        ...


class LayerNormMILSTMCell(LSTMCell):
    def _apply(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        # Same multiplicative-integration step as MILSTMCell, with
        # brew.layer_norm applied to the combined gate pre-activations
        # ('gates_t_norm') before LSTMUnit. (Exact operator wiring is not
        # recoverable from the stripped dump.)
        ...
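
# Illustrative sketch, not part of the original module: driving a single
# LSTMCell over a whole sequence. The blob name 'input_sequence' and all
# dimensions here are hypothetical; 'inputs' must be a T x N x D blob, and
# the initial states come from the cell's default LSTMInitializer.
def _example_lstm_cell_over_sequence():
    model = ModelHelper(name='lstm_example')
    inputs = model.net.AddExternalInput('input_sequence')
    cell = LSTMCell(
        input_size=10,
        hidden_size=20,
        forget_bias=0.0,
        memory_optimization=False,
        name='lstm',
    )
    # Returns the full hidden-state sequence plus the interleaved
    # (all-steps, final) blobs for every state.
    output, all_states = cell.apply_over_sequence(
        model=model,
        inputs=inputs,
        seq_lengths=None,
        initial_states=None,
    )
    return output, all_states
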
class DropoutCell(RNNCell):
    """
    Wraps an arbitrary RNNCell, applying dropout to its output (but not to
    the recurrent connection for the corresponding state).
    """

    def __init__(
        self, internal_cell, dropout_ratio=None, use_cudnn=False, **kwargs
    ):
        self.internal_cell = internal_cell
        self.dropout_ratio = dropout_ratio
        assert 'is_test' in kwargs, "Argument 'is_test' is required"
        self.is_test = kwargs.pop('is_test')
        self.use_cudnn = use_cudnn
        super(DropoutCell, self).__init__(**kwargs)

        self.prepare_input = internal_cell.prepare_input
        self.get_output_state_index = internal_cell.get_output_state_index
        self.get_state_names = internal_cell.get_state_names
        self.get_output_dim = internal_cell.get_output_dim

        self.mask = 0

    def _apply(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        return self.internal_cell._apply(
            model,
            input_t,
            seq_lengths,
            states,
            timestep,
            extra_inputs,
        )

    def _prepare_output(self, model, states):
        output = self.internal_cell._prepare_output(model, states)
        if self.dropout_ratio is not None:
            output = self._apply_dropout(model, output)
        return output

    def _prepare_output_sequence(self, model, state_outputs):
        output = self.internal_cell._prepare_output_sequence(
            model, state_outputs)
        if self.dropout_ratio is not None:
            output = self._apply_dropout(model, output)
        return output

    def _apply_dropout(self, model, output):
        if self.dropout_ratio and not self.forward_only:
            with core.NameScope(self.name or ''):
                output = brew.dropout(
                    model,
                    output,
                    str(output) + '_with_dropout_mask{}'.format(self.mask),
                    ratio=float(self.dropout_ratio),
                    is_test=self.is_test,
                    use_cudnn=self.use_cudnn,
                )
                self.mask += 1
        return output


class MultiRNNCellInitializer(object):
    def __init__(self, cells):
        self.cells = cells

    def create_states(self, model):
        states = []
        for i, cell in enumerate(self.cells):
            if cell.initializer is None:
                raise Exception(
                    'Either initial states or initializer have to be set')

            with core.NameScope('layer_{}'.format(i)), \
                    core.NameScope(cell.name):
                states.extend(cell.initializer.create_states(model))
        return states
class MultiRNNCell(RNNCell):
    """
    Multilayer RNN via the composition of RNNCell instances.

    It is the responsibility of calling code to ensure the compatibility
    of the successive layers in terms of input/output dimensionality, etc.,
    and to ensure that their blobs do not have name conflicts, typically by
    creating the cells with names that specify the layer number.

    Assumes the first state (recurrent output) for each layer should be the
    input to the next layer.
    """

    def __init__(self, cells, residual_output_layers=None, **kwargs):
        """
        cells: list of RNNCell instances, from input to output side.

        name: string designating network component (for scoping)

        residual_output_layers: list of indices of layers whose input will
        be added elementwise to their output elementwise. (It is the
        responsibility of the client code to ensure shape compatibility.)
        Note that layer 0 (zero) cannot have residual output because of the
        timing of prepare_input().

        forward_only: used to construct inference-only network.
        """
        super(MultiRNNCell, self).__init__(**kwargs)
        self.cells = cells

        if residual_output_layers is None:
            self.residual_output_layers = []
        else:
            self.residual_output_layers = residual_output_layers

        output_index_per_layer = []
        base_index = 0
        for cell in self.cells:
            output_index_per_layer.append(
                base_index + cell.get_output_state_index())
            base_index += len(cell.get_state_names())

        self.output_connected_layers = []
        self.output_indices = []
        for i in range(len(self.cells) - 1):
            if (i + 1) in self.residual_output_layers:
                self.output_connected_layers.append(i)
                self.output_indices.append(output_index_per_layer[i])
            else:
                self.output_connected_layers = []
                self.output_indices = []
        self.output_connected_layers.append(len(self.cells) - 1)
        self.output_indices.append(output_index_per_layer[-1])

        self.state_names = []
        for i, cell in enumerate(self.cells):
            self.state_names.extend(
                map(self.layer_scoper(i), cell.get_state_names()))

        self.initializer = MultiRNNCellInitializer(cells)

    def layer_scoper(self, layer_id):
        def helper(name):
            return '{}/layer_{}/{}'.format(self.name, layer_id, name)
        return helper

    def prepare_input(self, model, input_blob):
        input_blob = _RectifyName(input_blob)
        with core.NameScope(self.name or ''):
            return self.cells[0].prepare_input(model, input_blob)

    def _apply(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        """
        Because below we will do scoping across layers, we need
        to make sure that string blob names are converted to BlobReference
        objects.
        """
        input_t, seq_lengths, states, timestep, extra_inputs = \
            self._rectify_apply_inputs(
                input_t, seq_lengths, states, timestep, extra_inputs)

        states_per_layer = [
            len(cell.get_state_names()) for cell in self.cells
        ]
        assert len(states) == sum(states_per_layer)

        next_states = []
        states_index = 0

        layer_input = input_t
        for i, layer_cell in enumerate(self.cells):
            # Scoping is handled here rather than inside the layer cells so
            # that the same blob-naming convention applies to every layer.
            with core.NameScope(self.name), \
                    core.NameScope('layer_{}'.format(i)):
                num_states = states_per_layer[i]
                layer_states = states[states_index:states_index + num_states]
                states_index += num_states

                if i > 0:
                    prepared_input = layer_cell.prepare_input(
                        model, layer_input)
                else:
                    prepared_input = layer_input

                layer_next_states = layer_cell._apply(
                    model,
                    prepared_input,
                    seq_lengths,
                    layer_states,
                    timestep,
                    extra_inputs=(None if i > 0 else extra_inputs),
                )
                # Since we use the non-public _apply here instead of apply,
                # the layer output has to be extracted from states manually.
                if i != len(self.cells) - 1:
                    layer_output = layer_cell._prepare_output(
                        model, layer_next_states)
                    if i > 0 and i in self.residual_output_layers:
                        layer_input = brew.sum(
                            model,
                            [layer_output, layer_input],
                            self.scope('residual_output_{}'.format(i)),
                        )
                    else:
                        layer_input = layer_output

                next_states.extend(layer_next_states)
        return next_states

    def get_state_names(self):
        return self.state_names

    def get_output_state_index(self):
        index = 0
        for cell in self.cells[:-1]:
            index += len(cell.get_state_names())
        index += self.cells[-1].get_output_state_index()
        return index

    def _prepare_output(self, model, states):
        connected_outputs = []
        state_index = 0
        for i, cell in enumerate(self.cells):
            num_states = len(cell.get_state_names())
            if i in self.output_connected_layers:
                layer_states = states[state_index:state_index + num_states]
                layer_output = cell._prepare_output(model, layer_states)
                connected_outputs.append(layer_output)
            state_index += num_states
        if len(connected_outputs) > 1:
            output = brew.sum(
                model,
                connected_outputs,
                self.scope('residual_output'),
            )
        else:
            output = connected_outputs[0]
        return output

    def _prepare_output_sequence(self, model, state_outputs):
        connected_outputs = []
        state_index = 0
        for i, cell in enumerate(self.cells):
            num_states = 2 * len(cell.get_state_names())
            if i in self.output_connected_layers:
                layer_states = \
                    state_outputs[state_index:state_index + num_states]
                layer_output = cell._prepare_output_sequence(
                    model, layer_states)
                connected_outputs.append(layer_output)
            state_index += num_states
        if len(connected_outputs) > 1:
            output = brew.sum(
                model,
                connected_outputs,
                self.scope('residual_output_sequence'),
            )
        else:
            output = connected_outputs[0]
        return output


class AttentionCell(RNNCell):
    def __init__(
        self,
        encoder_output_dim,
        encoder_outputs,
        encoder_lengths,
        decoder_cell,
        decoder_state_dim,
        attention_type,
        weighted_encoder_outputs,
        attention_memory_optimization,
        **kwargs
    ):
        super(AttentionCell, self).__init__(**kwargs)
        self.encoder_output_dim = encoder_output_dim
        self.encoder_outputs = encoder_outputs
        self.encoder_lengths = encoder_lengths
        self.decoder_cell = decoder_cell
        self.decoder_state_dim = decoder_state_dim
        self.weighted_encoder_outputs = weighted_encoder_outputs
        self.encoder_outputs_transposed = None
        assert attention_type in [
            AttentionType.Regular,
            AttentionType.Recurrent,
            AttentionType.Dot,
            AttentionType.SoftCoverage,
        ]
        self.attention_type = attention_type
        self.attention_memory_optimization = attention_memory_optimization

    def _apply(
        self, model, input_t, seq_lengths, states, timestep, extra_inputs=None
    ):
        # Runs the wrapped decoder cell with the previous attention context
        # (and, for SoftCoverage, the previous coverage) as extra inputs, then
        # recomputes the attention context from the new hidden state via
        # apply_{regular,recurrent,dot,soft_coverage}_attention. It stores
        # self.hidden_t_intermediate and self.attention_weights_3d, copies the
        # hidden state to 'hidden_t_external', and registers the attention
        # blobs in self.recompute_blobs when attention_memory_optimization is
        # set. (Exact operator wiring is not recoverable from the stripped
        # dump.)
        ...

    def get_attention_weights(self):
        # [batch_size, encoder_length, 1]
        return self.attention_weights_3d

    def prepare_input(self, model, input_blob):
        if self.encoder_outputs_transposed is None:
            self.encoder_outputs_transposed = brew.transpose(
                model,
                self.encoder_outputs,
                self.scope('encoder_outputs_transposed'),
                axes=[1, 2, 0],
            )
        if (
            self.weighted_encoder_outputs is None and
            self.attention_type != AttentionType.Dot
        ):
            self.weighted_encoder_outputs = brew.fc(
                model,
                self.encoder_outputs,
                self.scope('weighted_encoder_outputs'),
                dim_in=self.encoder_output_dim,
                dim_out=self.encoder_output_dim,
                axis=2,
            )
        return self.decoder_cell.prepare_input(model, input_blob)

    def build_initial_coverage(self, model):
        """
        initial_coverage is always zeros of shape [encoder_length],
        whose shape must be determined programmatically during network
        computation.

        This method also sets self.coverage_weights, a separate transform
        of encoder_outputs which is used to determine the coverage
        contribution to attention.
        """
        assert self.attention_type == AttentionType.SoftCoverage
        self.coverage_weights = brew.fc(
            model,
            self.encoder_outputs,
            self.scope('coverage_weights'),
            dim_in=self.encoder_output_dim,
            dim_out=self.encoder_output_dim,
            axis=2,
        )

        encoder_length = model.net.Slice(
            model.net.Shape(self.encoder_outputs),
            starts=[0],
            ends=[1],
        )
        if (
            scope.CurrentDeviceScope() is not None and
            core.IsGPUDeviceType(scope.CurrentDeviceScope().device_type)
        ):
            encoder_length = model.net.CopyGPUToCPU(
                encoder_length, 'encoder_length_cpu')

        initial_coverage = model.net.ConstantFill(
            encoder_length,
            self.scope('initial_coverage'),
            value=0.0,
            input_as_shape=1,
        )
        return initial_coverage

    def get_state_names(self):
        state_names = list(self.decoder_cell.get_state_names())
        state_names[self.get_output_state_index()] = self.scope(
            'hidden_t_external')
        state_names.append(
            self.scope('attention_weighted_encoder_context_t'))
        if self.attention_type == AttentionType.SoftCoverage:
            state_names.append(self.scope('coverage_t'))
        return state_names

    def get_output_dim(self):
        return self.decoder_state_dim + self.encoder_output_dim

    def get_output_state_index(self):
        return self.decoder_cell.get_output_state_index()

    def _prepare_output(self, model, states):
        if self.attention_type == AttentionType.SoftCoverage:
            attention_context = states[-2]
        else:
            attention_context = states[-1]

        with core.NameScope(self.name or ''):
            output = brew.concat(
                model,
                [self.hidden_t_intermediate, attention_context],
                'states_and_context_combination',
                axis=2,
            )
        return output

    def _prepare_output_sequence(self, model, state_outputs):
        if self.attention_type == AttentionType.SoftCoverage:
            decoder_state_outputs = state_outputs[:-4]
        else:
            decoder_state_outputs = state_outputs[:-2]

        decoder_output = self.decoder_cell._prepare_output_sequence(
            model, decoder_state_outputs)

        if self.attention_type == AttentionType.SoftCoverage:
            attention_context_index = 2 * (len(self.get_state_names()) - 2)
        else:
            attention_context_index = 2 * (len(self.get_state_names()) - 1)

        with core.NameScope(self.name or ''):
            output = brew.concat(
                model,
                [decoder_output, state_outputs[attention_context_index]],
                'states_and_context_combination',
                axis=2,
            )
        return output


class LSTMWithAttentionCell(AttentionCell):
    def __init__(
        self,
        encoder_output_dim,
        encoder_outputs,
        encoder_lengths,
        decoder_input_dim,
        decoder_state_dim,
        name,
        attention_type,
        weighted_encoder_outputs,
        forget_bias,
        lstm_memory_optimization,
        attention_memory_optimization,
        forward_only=False,
    ):
        decoder_cell = LSTMCell(
            input_size=decoder_input_dim,
            hidden_size=decoder_state_dim,
            forget_bias=forget_bias,
            memory_optimization=lstm_memory_optimization,
            name='{}/decoder'.format(name),
            forward_only=False,
            drop_states=False,
        )
        super(LSTMWithAttentionCell, self).__init__(
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            encoder_lengths=encoder_lengths,
            decoder_cell=decoder_cell,
            decoder_state_dim=decoder_state_dim,
            name=name,
            attention_type=attention_type,
            weighted_encoder_outputs=weighted_encoder_outputs,
            attention_memory_optimization=attention_memory_optimization,
            forward_only=forward_only,
        )


class MILSTMWithAttentionCell(AttentionCell):
    def __init__(
        self,
        encoder_output_dim,
        encoder_outputs,
        decoder_input_dim,
        decoder_state_dim,
        name,
        attention_type,
        weighted_encoder_outputs,
        forget_bias,
        lstm_memory_optimization,
        attention_memory_optimization,
        forward_only=False,
    ):
        # Same wiring as LSTMWithAttentionCell, but the decoder is an
        # MILSTMCell. The dump shows no encoder_lengths argument here;
        # passing encoder_lengths=None (all encoder sequences assumed the
        # same length) is an assumption made to keep the call consistent
        # with AttentionCell.__init__.
        decoder_cell = MILSTMCell(
            input_size=decoder_input_dim,
            hidden_size=decoder_state_dim,
            forget_bias=forget_bias,
            memory_optimization=lstm_memory_optimization,
            name='{}/decoder'.format(name),
            forward_only=False,
            drop_states=False,
        )
        super(MILSTMWithAttentionCell, self).__init__(
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            encoder_lengths=None,
            decoder_cell=decoder_cell,
            decoder_state_dim=decoder_state_dim,
            name=name,
            attention_type=attention_type,
            weighted_encoder_outputs=weighted_encoder_outputs,
            attention_memory_optimization=attention_memory_optimization,
            forward_only=forward_only,
        )
def _LSTM(
    cell_class,
    model,
    input_blob,
    seq_lengths,
    initial_states,
    dim_in,
    dim_out,
    scope=None,
    outputs_with_grads=(0,),
    return_params=False,
    memory_optimization=False,
    forget_bias=0.0,
    forward_only=False,
    drop_states=False,
    return_last_layer_only=True,
    static_rnn_unroll_size=None,
    **cell_kwargs
):
    """
    Adds a standard LSTM recurrent network operator to a model.

    cell_class: LSTMCell or compatible subclass

    model: ModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
            where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
            LSTMUnit operator

    initial_states: a list of (2 * num_layers) blobs representing the initial
            hidden and cell states of each layer. If this argument is None,
            these states will be added to the model as network parameters.

    dim_in: input dimension

    dim_out: number of units per LSTM layer
            (use int for single-layer LSTM, list of ints for multi-layer)

    outputs_with_grads: position indices of output blobs for LAST LAYER which
            will receive external error gradient during backpropagation.
            These outputs are: (h_all, h_last, c_all, c_last)

    return_params: if True, will return a dictionary of parameters of the LSTM

    memory_optimization: if enabled, the LSTM step is recomputed on backward
            step so that we don't need to store forward activations for each
            timestep. Saves memory with cost of computation.

    forget_bias: forget gate bias (default 0.0)

    forward_only: whether to create a backward pass

    drop_states: drop invalid states, passed through to LSTMUnit operator

    return_last_layer_only: only return outputs from final layer
            (so that the length of results does not depend on the number of
            layers)

    static_rnn_unroll_size: if not None, we will use static RNN which is
            unrolled into Caffe2 graph. The size of the unroll is the value
            of this parameter.
    """
    if type(dim_out) is not list and type(dim_out) is not tuple:
        dim_out = [dim_out]
    num_layers = len(dim_out)

    cells = []
    for i in range(num_layers):
        cell = cell_class(
            input_size=(dim_in if i == 0 else dim_out[i - 1]),
            hidden_size=dim_out[i],
            forget_bias=forget_bias,
            memory_optimization=memory_optimization,
            name=scope if num_layers == 1 else None,
            forward_only=forward_only,
            drop_states=drop_states,
            **cell_kwargs
        )
        cells.append(cell)

    cell = MultiRNNCell(
        cells,
        name=scope,
        forward_only=forward_only,
    ) if num_layers > 1 else cells[0]

    cell = (
        cell if static_rnn_unroll_size is None
        else UnrolledCell(cell, static_rnn_unroll_size))

    # Adjust output indices so they refer to the last layer's states.
    outputs_with_grads = [4 * (num_layers - 1) + i for i in outputs_with_grads]
    _, result = cell.apply_over_sequence(
        model=model,
        inputs=input_blob,
        seq_lengths=seq_lengths,
        initial_states=initial_states,
        outputs_with_grads=outputs_with_grads,
    )

    if return_last_layer_only:
        result = result[4 * (num_layers - 1):]
    if return_params:
        result = list(result) + [{
            'input': cell.get_input_params(),
            'recurrent': cell.get_recurrent_params(),
        }]
    return tuple(result)


LSTM = functools.partial(_LSTM, LSTMCell)
BasicRNN = functools.partial(_LSTM, BasicRNNCell)
MILSTM = functools.partial(_LSTM, MILSTMCell)
LayerNormLSTM = functools.partial(_LSTM, LayerNormLSTMCell)
LayerNormMILSTM = functools.partial(_LSTM, LayerNormMILSTMCell)


class UnrolledCell(object):
    def __init__(self, cell, T):
        self.T = T
        self.cell = cell

    def apply_over_sequence(
        self, model, inputs, seq_lengths, initial_states,
        outputs_with_grads=None,
    ):
        # Static alternative to recurrent_net: Split the prepared input into
        # self.T single-step blobs, apply the cell once per step under a
        # 'timestep_{}' name scope (with ParameterSharing so all steps reuse
        # one set of weights, and a CPU-pinned constant 'timestep' blob of
        # DataType.INT32 per step), then Concat the per-step states back into
        # full sequences, add ZeroGradient for the non-gradient outputs
        # (logged via logging.debug), and run _prepare_output_sequence.
        # (Exact operator wiring is not recoverable from the stripped dump.)
        ...


def GetLSTMParamNames():
    weight_params = [
        'input_gate_w', 'forget_gate_w', 'output_gate_w', 'cell_w']
    bias_params = [
        'input_gate_b', 'forget_gate_b', 'output_gate_b', 'cell_b']
    return {'weights': weight_params, 'biases': bias_params}


def InitFromLSTMParams(lstm_pblobs, param_values):
    """
    Set the parameters of LSTM based on predefined values.
    """
    weight_params = GetLSTMParamNames()['weights']
    bias_params = GetLSTMParamNames()['biases']
    for input_type in viewkeys(param_values):
        weight_values = [
            param_values[input_type][w].flatten() for w in weight_params
        ]
        wmat = np.array([])
        for w in weight_values:
            wmat = np.append(wmat, w)
        bias_values = [
            param_values[input_type][b].flatten() for b in bias_params
        ]
        bm = np.array([])
        for b in bias_values:
            bm = np.append(bm, b)

        weights_blob = lstm_pblobs[input_type]['weights']
        bias_blob = lstm_pblobs[input_type]['biases']
        cur_weight = workspace.FetchBlob(weights_blob)
        cur_biases = workspace.FetchBlob(bias_blob)

        workspace.FeedBlob(
            weights_blob,
            wmat.reshape(cur_weight.shape).astype(np.float32),
        )
        workspace.FeedBlob(
            bias_blob,
            bm.reshape(cur_biases.shape).astype(np.float32),
        )
def cudnn_LSTM(model, input_blob, initial_states, dim_in, dim_out,
               scope, recurrent_params=None, input_params=None,
               num_layers=1, return_params=False):
    """
    CuDNN version of LSTM for GPUs.

    input_blob          Blob containing the input. Will need to be available
                        when param_init_net is run, because the sequence
                        lengths and batch sizes will be inferred from the
                        size of this blob.
    initial_states      tuple of (hidden_init, cell_init) blobs
    dim_in              input dimensions
    dim_out             output/hidden dimension
    scope               namescope to apply
    recurrent_params    dict of blobs containing values for recurrent
                        gate weights, biases (if None, use random init values)
                        See GetLSTMParamNames() for format.
    input_params        dict of blobs containing values for input
                        gate weights, biases (if None, use random init values)
                        See GetLSTMParamNames() for format.
    num_layers          number of LSTM layers
    return_params       if True, returns (param_extract_net, param_mapping)
                        where param_extract_net is a net that when run, will
                        populate the blobs specified in param_mapping with the
                        current gate weights and biases (input/recurrent).
                        Useful for assigning the values back to non-cuDNN
                        LSTM.
    """
    with core.NameScope(scope):
        weight_params = GetLSTMParamNames()['weights']
        bias_params = GetLSTMParamNames()['biases']

        # Sizes the single flat cuDNN parameter blob ('lstm_weight', tagged
        # ParameterTags.WEIGHT) from dim_in/dim_out and num_layers, seeds the
        # per-gate input/recurrent weights and biases via RecurrentParamSet
        # (from recurrent_params/input_params when given, UniformFill
        # otherwise), then runs the fused Recurrent operator
        # (rnn_mode='lstm', input_mode='linear', engine='CUDNN',
        # seed=random.randint(...)) producing lstm_output,
        # lstm_hidden_output, lstm_cell_output, lstm_rnn_scratch and
        # lstm_dropout_states. With return_params=True it also builds a
        # 'lstm_param_extractor' net of RecurrentParamGet ops and returns
        # output, hidden_output, cell_output plus that (net, mapping) pair;
        # otherwise it returns the three outputs. (Exact operator wiring is
        # not recoverable from the stripped dump.)
        ...
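
# Illustrative sketch, not part of the original module: with
# return_params=True, cudnn_LSTM returns a fourth element, the
# (param_extract_net, param_mapping) pair described in its docstring.
# Running that net fills the mapped blobs so they can later be fed through
# InitFromLSTMParams to an equivalent non-cuDNN LSTM. The blob arguments and
# dimensions here are hypothetical.
def _example_cudnn_lstm(model, input_blob, hidden_init, cell_init):
    output, hidden_output, cell_output, param_extract = cudnn_LSTM(
        model=model,
        input_blob=input_blob,
        initial_states=(hidden_init, cell_init),
        dim_in=32,
        dim_out=64,
        scope='cudnn_lstm',
        return_params=True,
    )
    return output, param_extract
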
def LSTMWithAttention(
    model,
    decoder_inputs,
    decoder_input_lengths,
    initial_decoder_hidden_state,
    initial_decoder_cell_state,
    initial_attention_weighted_encoder_context,
    encoder_output_dim,
    encoder_outputs,
    encoder_lengths,
    decoder_input_dim,
    decoder_state_dim,
    scope,
    attention_type=AttentionType.Regular,
    outputs_with_grads=(0, 4),
    weighted_encoder_outputs=None,
    lstm_memory_optimization=False,
    attention_memory_optimization=True,
    forget_bias=0.0,
    forward_only=False,
):
    """
    Adds an LSTM with attention mechanism to a model.

    The implementation is based on https://arxiv.org/abs/1409.0473, with
    a small difference in the order in which we compute the new attention
    context and the new hidden state, similarly to
    https://arxiv.org/abs/1508.04025.

    The model uses encoder-decoder naming conventions,
    where the decoder is the sequence the op is iterating over,
    while computing the attention context over the encoder.

    model: ModelHelper object new operators would be added to

    decoder_inputs: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    decoder_input_lengths: blob containing sequence lengths
    which would be passed to LSTMUnit operator

    initial_decoder_hidden_state: initial hidden state of LSTM

    initial_decoder_cell_state: initial cell state of LSTM

    initial_attention_weighted_encoder_context: initial attention context

    encoder_output_dim: dimension of encoder outputs

    encoder_outputs: the sequence, on which we compute the attention context
    at every iteration

    encoder_lengths: a tensor with lengths of each encoder sequence in batch
    (may be None, meaning all encoder sequences are of same length)

    decoder_input_dim: input dimension (last dimension on decoder_inputs)

    decoder_state_dim: size of hidden states of LSTM

    attention_type: One of: AttentionType.Regular, AttentionType.Recurrent.
    Determines which type of attention mechanism to use.

    outputs_with_grads: position indices of output blobs which will receive
    external error gradient during backpropagation

    weighted_encoder_outputs: encoder outputs to be used to compute attention
    weights. In the basic case it's just a linear transformation of encoder
    outputs (that is the default, when weighted_encoder_outputs is None).
    However, it can be something more complicated - like a separate encoder
    network (for example, in case of a convolutional encoder).

    lstm_memory_optimization: recompute LSTM activations on backward pass, so
    we don't need to store their values in forward passes

    attention_memory_optimization: recompute attention for backward pass

    forward_only: whether to create only forward pass
    """
    cell = LSTMWithAttentionCell(
        encoder_output_dim=encoder_output_dim,
        encoder_outputs=encoder_outputs,
        encoder_lengths=encoder_lengths,
        decoder_input_dim=decoder_input_dim,
        decoder_state_dim=decoder_state_dim,
        name=scope,
        attention_type=attention_type,
        weighted_encoder_outputs=weighted_encoder_outputs,
        forget_bias=forget_bias,
        lstm_memory_optimization=lstm_memory_optimization,
        attention_memory_optimization=attention_memory_optimization,
        forward_only=forward_only,
    )
    initial_states = [
        initial_decoder_hidden_state,
        initial_decoder_cell_state,
        initial_attention_weighted_encoder_context,
    ]
    if attention_type == AttentionType.SoftCoverage:
        initial_states.append(cell.build_initial_coverage(model))
    _, result = cell.apply_over_sequence(
        model=model,
        inputs=decoder_inputs,
        seq_lengths=decoder_input_lengths,
        initial_states=initial_states,
        outputs_with_grads=outputs_with_grads,
    )
    return result


def _layered_LSTM(
    model,
    input_blob,
    seq_lengths,
    initial_states,
    dim_in,
    dim_out,
    scope,
    outputs_with_grads=(0,),
    return_params=False,
    memory_optimization=False,
    forget_bias=0.0,
    forward_only=False,
    drop_states=False,
    create_lstm=None,
):
    params = locals()  # leave this as the first line to grab all params
    params.pop('create_lstm')
    if not isinstance(dim_out, list):
        return create_lstm(**params)
    elif len(dim_out) == 1:
        params['dim_out'] = dim_out[0]
        return create_lstm(**params)

    assert len(dim_out) != 0, "dim_out list can't be empty"
    assert return_params is False, 'return_params not supported for layering'
    for i, output_dim in enumerate(dim_out):
        params.update({
            'dim_out': output_dim,
        })
        output, last_output, all_states, last_state = create_lstm(**params)
        params.update({
            'input_blob': output,
            'dim_in': output_dim,
            # The value fed forward as the next layer's initial states is not
            # recoverable from the stripped dump; None falls back to the
            # cell's initializer.
            'initial_states': None,
            'scope': scope + '_layer_{}'.format(i + 1),
        })
    return output, last_output, all_states, last_state


layered_LSTM = functools.partial(_layered_LSTM, create_lstm=LSTM)
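
# Illustrative sketch, not part of the original module: wiring
# LSTMWithAttention as a seq2seq decoder. Every blob argument is assumed to
# be produced elsewhere in the model; the dimensions are hypothetical.
def _example_lstm_with_attention(
    model,
    decoder_inputs,
    decoder_input_lengths,
    h0,
    c0,
    initial_context,
    encoder_outputs,
    encoder_lengths,
):
    return LSTMWithAttention(
        model=model,
        decoder_inputs=decoder_inputs,
        decoder_input_lengths=decoder_input_lengths,
        initial_decoder_hidden_state=h0,
        initial_decoder_cell_state=c0,
        initial_attention_weighted_encoder_context=initial_context,
        encoder_output_dim=64,
        encoder_outputs=encoder_outputs,
        encoder_lengths=encoder_lengths,
        decoder_input_dim=32,
        decoder_state_dim=64,
        scope='attention_decoder',
        attention_type=AttentionType.Regular,
    )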