U
    -e                     @   s   d dl Z d dlmZmZmZ d dlZd dlmZ	 ddl
mZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddl m!Z! eee" dddZ#G dd de!Z$G dd dZ%dS )    N)BinaryIOOptionalUnion   )AudioDatasetFeaturesImage
NamedSplitValueconfig)FeatureType_visit)query_table)_PACKAGED_DATASETS_MODULES)Parquet)logging)NestedDataStructureLikePathLike   )AbstractDatasetReader)featuresreturnc                    s6   t j tdd fdd}t| |  t jkr2dS  S )a  
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is 1,000 but we lower it to 100 for image datasets.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        ds_config_info (`datasets.info.DatasetInfo`):
            Dataset info from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default.
    N)featurer   c                    sT   t | trt tj n8t | tr0t tj n t | trP| jdkrPt tj	 d S )Nbinary)

isinstancer	   minr   Z)PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETSr   Z)PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETSr   ZdtypeZ*PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS)r   
batch_size T/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/io/parquet.pyset_batch_size&   s    

z-get_writer_batch_size.<locals>.set_batch_size)npinfr   r   )r   r!   r   r   r    get_writer_batch_size   s    	
r$   c                	       sH   e Zd Zdee ee ee ee	e	ee
 d fddZdd Z  ZS )	ParquetDatasetReaderNF)path_or_pathssplitr   	cache_dirkeep_in_memory	streamingnum_procc           
   	      sd   t  j|f||||||d| t|tr0|n| j|i}td d }	tf ||||	d|| _d S )N)r'   r   r(   r)   r*   r+   parquetr   )r(   
data_filesr   hash)super__init__r   dictr'   r   r   builder)
selfr&   r'   r   r(   r)   r*   r+   kwargsr.   	__class__r   r    r0   5   s,    
zParquetDatasetReader.__init__c                 C   sZ   | j r| jj| jd}n>d }d }d }d }| jj||||| jd | jj| j|| jd}|S )N)r'   )download_configdownload_modeverification_mode	base_pathr+   )r'   r9   Z	in_memory)r*   r2   Zas_streaming_datasetr'   Zdownload_and_preparer+   Z
as_datasetr)   )r3   datasetr7   r8   r9   r:   r   r   r    readT   s&      zParquetDatasetReader.read)NNNFFN)__name__
__module____qualname__r   r   r   r
   r   strboolintr0   r<   __classcell__r   r   r5   r    r%   4   s          r%   c                   @   sL   e Zd Zdeeeef ee dddZ	edddZ
eeedd	d
ZdS )ParquetDatasetWriterN)r;   path_or_bufr   c                 K   s&   || _ || _|pt|j| _|| _d S )N)r;   rE   r$   r   r   parquet_writer_kwargs)r3   r;   rE   r   rF   r   r   r    r0   n   s    zParquetDatasetWriter.__init__)r   c              	   C   sv   | j r| j ntj}t| jtttjfrXt	| jd}| j
f ||d| j}W 5 Q R X n| j
f | j|d| j}|S )Nzwb+)file_objr   )r   r   ZDEFAULT_MAX_BATCH_SIZEr   rE   r@   bytesosr   open_writerF   )r3   r   bufferwrittenr   r   r    writez   s    $zParquetDatasetWriter.write)rG   r   r   c           
      K   s   d}| dd}| jjj}tj|fd|i|}tjtdt	| j|dt
  ddD ]H}t| jjt||| | jjdk	r| jjndd}	||	 ||	j7 }qV|  |S )	zWrites the pyarrow table as Parquet to a binary file handle.

        Caller is responsible for opening and closing the handle.
        r   rE   Nschemabaz"Creating parquet from Arrow format)unitdisabledesc)tablekeyindices)popr;   r   Zarrow_schemapqZParquetWriterr   ZtqdmrangelenZis_progress_bar_enabledr   _datasliceZ_indicesZwrite_tablenbytesclose)
r3   rG   r   rF   rM   _rO   writeroffsetbatchr   r   r    rK      s&    


zParquetDatasetWriter._write)N)r=   r>   r?   r   r   r   r   r   rB   r0   rN   rK   r   r   r   r    rD   m   s    

rD   )&rI   typingr   r   r   numpyr"   Zpyarrow.parquetr,   rX    r   r   r   r	   r
   r   r   Zfeatures.featuresr   r   Z
formattingr   Zpackaged_modulesr   Z packaged_modules.parquet.parquetr   utilsr   Zutils.typingr   r   abcr   rB   r$   r%   rD   r   r   r   r    <module>   s   $#9