U
    dV2                     @   s   d Z ddlmZmZ ddlmZmZ ddlmZm	Z	m
Z
mZ ddlZG dd deZG dd	 d	eZG d
d deZdddZdd ZG dd deZdS )aH  
Implementation of an in-memory dataset with structured schema.

Use this to store and iterate through datasets with complex schema that
fit in memory.

Iterating through entries of this dataset is very fast since the dataset
is stored as a set of native Caffe2 tensors, thus no type conversion or
deserialization is necessary.
    )core	workspace)ReaderWriter)Structfrom_blob_listfrom_column_listInitEmptyRecordNc                   @   s.   e Zd ZdddZdd Zdd Zd	d
 ZdS )_DatasetReader   Fc                 C   s<   t | |  || _|p"|jd | _|| _|| _d| _dS )z7Don't call this directly. Instead, use dataset.reader()_cursorN)r   __init__contentdatasetname
batch_sizeenforce_batch_sizecursor)selfr   r   r   r    r   9/tmp/pip-unpacked-wheel-ua33x9lu/caffe2/python/dataset.pyr      s    z_DatasetReader.__init__c                 C   s,   | j d kr(|jg || j| jjd| _ d S Nfieldsr   ZCreateTreeCursorZNextScopedBlobr   r   r   r   init_netZexit_netr   r   r   setup_ex#   s    

z_DatasetReader.setup_exc              
   C   s   | j std| j }t|| jR |j| j g|	  |
 | j| jd}t|}||d g|fW  5 Q R  S Q R X d S )Nsetup not called.)r   r   r   )r   AssertionErrorr   r   r   	NameScopeNextNamer   ZReadNextBatchfield_blobsfield_namesr   r   output_to_listIsEmptyr   Zread_netr   r   r   r   r   read*   s    

z_DatasetReader.readc                 C   s   | | jgg  d S NZResetCursorr   r   netr   r   r   reset6   s    z_DatasetReader.resetN)r   F)__name__
__module____qualname__r   r   r'   r,   r   r   r   r   r
      s   
	r
   c                   @   s@   e Zd ZdddZdd Zdd Zd	d
 ZdddZdd ZdS )_DatasetRandomReaderr   Fc                 C   sH   t | |  || _d| _|p(|jd | _|| _|| _|| _|| _	dS )z>Don't call this directly. Instead, use dataset.random_reader()Nr   )
r   r   r   r   r   r   indicesr   	loop_overr   )r   r   r   r1   r   r2   r   r   r   r   r   ;   s    z_DatasetRandomReader.__init__c                 C   s,   | j d kr(|jg || j| jjd| _ d S r   r   r   r   r   r   r   G   s    

z_DatasetRandomReader.setup_exc                 C   s   | | jgg  d S r(   r)   r*   r   r   r   r,   N   s    z_DatasetRandomReader.resetc                 C   s2   |  | || jg| j   d}|| _d S )Noffsets)r,   ZComputeOffsetr   r   r   r"   r3   )r   r+   r3   r   r   r   computeoffsetQ   s    
z"_DatasetRandomReader.computeoffsetNc                 C   sh   | j  }d}|r4|| ks&td| |}| | |j| jg|  d|||d}|| _	d S )NzMust be valid field.r1   )sort_by_field_idxshuffle_sizer   )
r   r   r#   r   indexr,   ZSortAndShuffler   r"   r1   )r   r+   Zsort_by_fieldr7   r   r   r6   r1   r   r   r   sort_and_shuffleX   s     

z%_DatasetRandomReader.sort_and_shufflec              
   C   s   | j std| jstd| js*td| j }t|| j	^ |j
| j | j| jg|  | | j| j| jd}t|}||d g|fW  5 Q R  S Q R X d S )Nzsetup_ex not calledzsort_and_shuffle not calledzcomputeoffset not called)r   r   r2   r   )r   r   r1   r3   r   r   r   r    r!   r   ZReadRandomBatchr"   r#   r   r   r2   r$   r%   r&   r   r   r   r'   k   s     

z_DatasetRandomReader.read)r   FF)Nr   r   )	r-   r.   r/   r   r   r,   r4   r9   r'   r   r   r   r   r0   :   s     
    
r0   c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
_DatasetWriterc                 C   s   || _ d| _dS )z7Don't call this directly. Use dataset.writer() instead.N)_contentmutex)r   r   r   r   r   r   }   s    z_DatasetWriter.__init__c                 C   s   | j d kr|g | _ d S r(   )r<   ZCreateMutexr   r   r   r   r      s    
z_DatasetWriter.setup_exc                 C   sz   | j dk	std| j }t|t|ksDtdt|t|f |j|g | j d || j g| t| | dS )a  
        Add operations to `net` that append the blobs in `fields` to the end
        of the dataset. An additional operator will also be added that checks
        the consistency of the data in `fields` against the dataset schema.

        Args:
            writer_net: The net that will contain the Append operators.
            fields: A list of BlobReference to be appeneded to this dataset.
        Nr   zExpected %s fields, got %s.r   )	r<   r   r;   r"   lenZCheckDatasetConsistencyr#   ZAtomicAppendlist)r   Z
writer_netr   r"   r   r   r   write   s    

  z_DatasetWriter.writec                 C   s   dS )z+Commit is a no-op for an in-memory dataset.Nr   )r   Z
finish_netr   r   r   commit   s    z_DatasetWriter.commitN)r-   r.   r/   r   r   r?   r@   r   r   r   r   r:   |   s   r:   c                 C   sH   t | tjstdtj||d}| | j|d}t	t
|| |S )a  
    Create a 'constant' by first creating an external input in the given
    net, and then feeding the corresponding blob with its provided value
    in the current workspace. The name is automatically generated in order
    to avoid clashes with existing blob names.
    z net must be a core.Net instance.)dtype)prefix)
isinstancer   Netr   nparrayZAddExternalInputr!   r   ZFeedBlobstr)r+   valuerA   r   Zblobr   r   r   Const   s
    rI   c                 C   s,   t d}||gg  t j| ||dddS )N
report_netT   )rJ   Zconcurrent_substepsZreport_interval)r   rD   ZPrintZexecution_step)r   r   ZsubstepsZ	rows_readrJ   r   r   r   execution_step_with_progress   s    
rL   c                   @   sp   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdddZdddZdddZdS )DatasetaS  Represents an in-memory dataset with fixed schema.

    Use this to store and iterate through datasets with complex schema that
    fit in memory.

    Iterating through entries of this dataset is very fast since the dataset
    is stored as a set of native Caffe2 tensors, thus no type conversion or
    deserialization is necessary.
    Nc                 C   sl   t |tst |tstdt |tr.t|}|| _| | _| | _|pNd| _	|
 rb| nd| _dS )a  Create an un-initialized dataset with schema provided by `fields`.

        Before this dataset can be used, it must be initialized, either by
        `init_empty` or `init_from_dataframe`.

        Args:
            fields: either a schema.Struct or a list of field names in a format
                    compatible with the one described in schema.py.
            name: optional name to prepend to blobs that will store the data.
        z<fields must be either a Struct or a list of raw field names.r   N)rC   r>   r   r   r   schemar#   r   field_typesr   Z	has_blobsr"   )r   r   r   r   r   r   r      s    



zDataset.__init__c                 C   s   |j | j| j| j|d dS )z}
        Trims the contents of this dataset so that the number of records is
        multiple of the given argument.
        )r   multiple_ofN)ZTrimDatasetr"   r   )r   r+   rP   r   r   r   trim   s    zDataset.trimc                 C   s   t || j  | _dS )zInitialize the blobs for this dataset with empty values.

        Empty arrays will be immediately fed into the current workspace,
        and `init_net` will take those blobs as external inputs.
        N)r	   rN   Zclone_schemar"   )r   r   r   r   r   
init_empty   s     zDataset.init_emptyc                    s8   t | jt  jkst fddt| jD | _dS )zInitialize the blobs for this dataset from a Pandas dataframe.

        Each column of the dataframe will be immediately fed into the current
        workspace, and the `net` will take this blobs as external inputs.
        c                    s*   g | ]"\}}t  |g |d qS ))r   )rI   Z	as_matrixflatten).0colfield	dataframer+   r   r   
<listcomp>   s   z/Dataset.init_from_dataframe.<locals>.<listcomp>N)r=   r   columnsr   	enumerater"   )r   r+   rX   r   rW   r   init_from_dataframe   s    zDataset.init_from_dataframec                 C   s   | st | jS )zx
        Return the list of BlobReference pointing to the blobs that contain
        the data for this dataset.
        )r   r"   r   r   r   r   	get_blobs   s    zDataset.get_blobsc                 C   s   t | j| jS )zi
        Return a Record of BlobReferences pointing to the full content of
        this dataset.
        )r   rN   r"   r]   r   r   r   r      s    zDataset.contentc                 C   s   | j S )z0Return the list of field names for this dataset.r   r]   r   r   r   r#     s    zDataset.field_namesc                 C   s   | j S )z
        Return the list of field dtypes for this dataset.

        If a list of strings, not a schema.Struct, was passed to the
        constructor, this will return a list of dtype(np.void).
        )rO   r]   r   r   r   rO     s    zDataset.field_typesr   Fc                 C   s4   | j stdt| |||}|dk	r0||d |S )a  Create a Reader object that is used to iterate through the dataset.

        This will append operations to `init_net` that create a TreeCursor,
        used to iterate through the data.

        NOTE: Currently, it is not safe to append to a dataset while reading.

        Args:
            init_net: net that will be run once to create the cursor.
            cursor_name: optional name for the blob containing a pointer
                         to the cursor.
            batch_size: how many samples to read per iteration.

        Returns:
            A _DatasetReader that can be used to create operators that will
            iterate through the dataset.
        Dataset not initialized.N)r"   r   r
   r   )r   r   cursor_namer   r   readerr   r   r   ra     s    zDataset.readerc                 C   s8   | j stdt| |||||}|dk	r4||d |S )a  Create a Reader object that is used to iterate through the dataset.

        NOTE: The reader order depends on the order in indices.

        Args:
            init_net: net that will be run once to create the cursor.
            indices: blob of reading order
            cursor_name: optional name for the blob containing a pointer
                         to the cursor.
            batch_size: how many samples to read per iteration.
            loop_over: repeat the dataset indefinitely (in the same order)

        Returns:
            A DatasetReader that can be used to create operators that will
            iterate through the dataset according to indices.
        r_   N)r"   r   r0   r   )r   r   r1   r`   r   r2   r   ra   r   r   r   random_reader.  s        zDataset.random_readerc                 C   s2   | j stdt|  }|dk	r.||d |S )a  Create a Writer that can be used to append entries into the dataset.

        NOTE: Currently, it is not safe to append to a dataset
              while reading from it.
        NOTE: Currently implementation of writer is not thread safe.
              TODO: fixme

        Args:
            init_net: net that will be run once in order to create the writer.
                      (currently not used)
        r_   N)r"   r   r:   r   r   )r   r   writerr   r   r   rc   H  s
    zDataset.writer)N)NNr   F)NNNr   FF)N)r-   r.   r/   __doc__r   rQ   rR   r\   r^   r   r#   rO   ra   rb   rc   r   r   r   r   rM      s$   

		  
      
rM   )NN)rd   Zcaffe2.pythonr   r   Zcaffe2.python.dataior   r   Zcaffe2.python.schemar   r   r   r	   ZnumpyrE   r
   r0   r:   rI   rL   objectrM   r   r   r   r   <module>   s   !B#
