U
    -e(i                     @   s  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlZddlmZ ddlmZ ddlmZmZ dd	lmZmZmZmZ dd
lmZ ddl m!Z! erddl"m#Z# ddl$m%Z%m&Z& e'e(Z)dZ*e+dedd  dej,Z-e+dZ.G dd de/Z0G dd de/Z1eddG dd dZ2d3e3ed ee3df ee3 ee3 e2dddZ4G d d! d!Z5G d"d# d#e5Z6G d$d% d%e5Z7eddG d&d' d'Z8eddG d(d) d)Z9d*d+ Z:d,d- Z;d.d/ Z<d0d1 Z=G d2d dZ>dS )4z Arrow ArrowReader.    N)	dataclass)Path)TYPE_CHECKINGListOptionalUnion   )DownloadConfig)	_split_refilenames_for_dataset_split)InMemoryTableMemoryMappedTableTableconcat_tables)logging)cached_path)DatasetInfo)Split	SplitInfoz=https://storage.googleapis.com/huggingface-nlp/cache/datasetsz
^
 (?P<split>z)
 (\[
    ((?P<from>-?\d+)
     (?P<from_pct>%)?)?
    :
    ((?P<to>-?\d+)
     (?P<to_pct>%)?)?
 \])?(\((?P<rounding>[^\)]*)\))?
$
z\s*\+\s*c                   @   s   e Zd ZdZdS )DatasetNotOnHfGcsErrorz?When you can't get the dataset from the Hf google cloud storageN__name__
__module____qualname____doc__ r   r   V/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/arrow_reader.pyr   A   s   r   c                   @   s   e Zd ZdZdS )MissingFilesOnHfGcsErrorz9When some files are missing on the Hf oogle cloud storageNr   r   r   r   r   r   G   s   r   T)frozenc                   @   s&   e Zd ZU dZeed< ee ed< dS )FileInstructionsa}  The file instructions associated with a split ReadInstruction.

    Attributes:
        num_examples: `int`, The total number of examples
        file_instructions: List[dict(filename, skip, take)], the files information.
            The filenames contains the relative path, not absolute.
            skip/take indicates which example read in the file: `ds.slice(skip, take)`
    num_examplesfile_instructionsN)r   r   r   r   int__annotations__r   dictr   r   r   r   r    M   s   
	r    r   ReadInstruction)namesplit_infosinstructionfiletype_suffixprefix_pathreturnc                    s  t ts tdtj ns,tddd |D }dd |D  fdd|D }t |tstt|}||}g }d}	|D ].}
||
j	 }||
j	 }|
j	 }|
j
dkrdn|
j
}|
jdkr|n|
j}|dkr|D ]&}|	|| 7 }	||||| d	 qqd}d}t||D ]\}}||7 }||k r||kr||krV|| nd}||k rp|| | nd
}|dkrq ||||d	 |	|d
kr|| n|7 }	||7 }q qt|	|dS )a  Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]
    zExpected str 'name', but got: zExpected non-empty str 'name'c                 S   s   i | ]}|j |jqS r   )r'   r!   .0infor   r   r   
<dictcomp>s   s      z*make_file_instructions.<locals>.<dictcomp>c                 S   s   i | ]}|j |jqS r   )r'   shard_lengthsr-   r   r   r   r0   t   s      c              
      s*   i | ]"}|j t|j  |j  d qS ))pathZdataset_namesplitr*   r1   )r'   r   r-   r*   r'   Zname2shard_lengthsr+   r   r   r0   u   s    r   N)filenameskiptaker   )r!   r"   )
isinstancestr	TypeErrortyper   
ValueErrorr&   	from_specto_absolute	splitnamefrom_toappendzipr    )r'   r(   r)   r*   r+   name2lenZname2filenamesZabsolute_instructionsr"   r!   Z	abs_instrZsplit_length	filenamesr1   r@   rA   r5   Zindex_startZ	index_endZshard_lengthr6   r7   r   r4   r   make_file_instructions\   sP    










rF   c                   @   s~   e Zd ZdZeed dddZdeddd	Zdedd
dZ	dd Z
dddZdee ed dddZedddZdS )
BaseReaderz@
    Build a Dataset object out of Instruction instance(s).
    r   r2   r/   c                 C   s   || _ || _d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        N)_path_info_filetype_suffixselfr2   r/   r   r   r   __init__   s    zBaseReader.__init__Fr,   c                 C   s   t dS )=Returns a Dataset instance from given (filename, skip, take).N)NotImplementedError)rM   filename_skip_take	in_memoryr   r   r   _get_table_from_filename   s    z#BaseReader._get_table_from_filenamec                 C   s   t |dkstdd |D s&tdg }t|}|D ]}tj| j|d |d< q8|D ]}| j	||d}|
| qZdd |D }|s| jd	ks| jjd	krtd
|ptjg t| jjjdg}t |dkrt|n|d }|S )a  Returns Dataset for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contain the absolute path, not relative.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.
        r   c                 s   s   | ]}t |tV  qd S N)r8   r%   )r.   fr   r   r   	<genexpr>   s     z)BaseReader._read_files.<locals>.<genexpr>z&please provide valid file informationsr5   rS   c                 S   s   g | ]}t |d kr|qS )r   )len)r.   tr   r   r   
<listcomp>   s      z*BaseReader._read_files.<locals>.<listcomp>NzqTried to read an empty table. Please specify at least info.features to create an empty table with the right type.)schemar   )rY   allr<   copydeepcopyosr2   joinrI   rT   rB   rJ   featuresr   Zfrom_batchespar\   r;   r   )rM   filesrS   Z	pa_tablesrV   Zf_dictpa_tabler   r   r   _read_files   s"    	
 zBaseReader._read_filesc                 C   s    t |||| j| jd}|j}|S )z?Return list of dict {'filename': str, 'skip': int, 'take': int})r*   r+   )rF   rK   rI   r"   )rM   r'   r)   r(   r"   rd   r   r   r   get_file_instructions   s        z BaseReader.get_file_instructionsc                 C   s6   |  |||}|s&d| d}t|| j|||dS )a  Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
             kwargs to build a single Dataset instance.
        zInstruction "z" corresponds to no data!)rd   original_instructionsrS   )rg   r<   
read_files)rM   r'   Zinstructionsr(   rS   rd   msgr   r   r   read   s
    zBaseReader.readN)Nr&   r   )rd   rh   c                 C   sF   | j ||d}|dk	r0ddlm} |t|}nd}|| j|d}|S )aJ  Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        rX   Nr   )r   )Zarrow_tabler/   r3   )rf   splitsr   r9   rJ   )rM   rd   rh   rS   re   r   r3   Zdataset_kwargsr   r   r   ri      s    zBaseReader.read_filesdownload_configc              
   C   sX  t d |tjd }z\tj|d}t|tjd}t|tj| j	d | j
dk	rp| j
| j
| j	 W n, tk
r } zt|dW 5 d}~X Y nX z| j
jD ]v}| j| j
j|| j
j d}|D ]P}	tt|	d | j	}
tj||
}t|tjd|d}t||	d  qqW n. tk
rR } zt|dW 5 d}~X Y nX dS )a%  
        Download the dataset files from the Hf GCS

        Args:
            dl_cache_dir: `str`, the local cache directory used to download files
            relative_data_dir: `str`, the relative directory of the remote files from
                the `datasets` directory on GCS.

        /zdataset_info.jsonN)r'   r)   r(   r5   rm   )HF_GCP_BASE_URLreplacer`   sepr2   ra   r   shutilmoverI   rJ   updatefrom_directoryFileNotFoundErrorr   rl   rg   Zbuilder_namevaluesr9   r   relative_tor   )rM   rn   Zrelative_data_dirZremote_cache_dirZremote_dataset_infoZdownloaded_dataset_infoerrr3   r"   Zfile_instructionZfile_to_downloadZremote_prepared_filenameZdownloaded_prepared_filenamer   r   r   download_from_hf_gcs  s4    


 zBaseReader.download_from_hf_gcs)F)F)F)NF)r   r   r   r   r9   r   rN   r   rT   rf   rg   rk   r   r%   r   ri   r	   r{   r   r   r   r   rG      s    
  rG   c                       sP   e Zd ZdZeed d fddZdeddd	Ze	dedd
dZ
  ZS )ArrowReaderz
    Build a Dataset object out of Instruction instance(s).
    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
    r   rH   c                    s   t  || d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        arrowNsuperrN   rK   rL   	__class__r   r   rN   ?  s    zArrowReader.__init__FrO   c                 C   s   |d d|kr|d ndd|kr(|d nd  }}}t j||d}|dkrVt|| }|dk	r|dk	r|dkrz|t|ks|||}|S )rP   r5   r6   Nr7   rX   r   r   )r|   
read_tablerY   slice)rM   rR   rS   r5   r6   r7   tabler   r   r   rT   I  s    
$z$ArrowReader._get_table_from_filenamec                 C   s   |rt nt}|| S )z
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            pyarrow.Table
        )r   r   	from_file)r5   rS   Z	table_clsr   r   r   r   X  s    zArrowReader.read_table)F)F)r   r   r   r   r9   r   rN   r   rT   staticmethodr   __classcell__r   r   r   r   r|   9  s
   
r|   c                       s4   e Zd ZdZeed d fddZdd Z  ZS )ParquetReaderzv
    Build a Dataset object out of Instruction instance(s).
    This Reader uses memory mapping on parquet files.
    r   rH   c                    s   t  || d| _dS )zInitializes ParquetReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        parquetNr~   rL   r   r   r   rN   n  s    zParquetReader.__init__c                 K   sv   |d d|kr|d ndd|kr(|d nd  }}}t j|dd}|dk	rr|dk	rr|dkrf|t|ksr|||}|S )rP   r5   r6   Nr7   T)Z
memory_mapr   )pqr   rY   r   )rM   rR   kwargsr5   r6   r7   re   r   r   r   rT   x  s    
$z&ParquetReader._get_table_from_filename)	r   r   r   r   r9   r   rN   rT   r   r   r   r   r   r   h  s   
r   c                   @   s*   e Zd ZU dZeed< eed< eed< dS )_AbsoluteInstructionz?A machine friendly slice: defined absolute positive boundaries.r?   r@   rA   N)r   r   r   r   r9   r$   r#   r   r   r   r   r     s   
r   c                   @   sb   e Zd ZU dZeed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< dd	 ZdS )
_RelativeInstructionzHRepresents a single parsed slicing instruction, can use % and negatives.r?   Nr@   rA   unitroundingc                 C   s   | j d k	r| j dkrtd| jd k	r8| jdkr8td| j dkrT| jd k	rTtd| j dkr~| jd k	r~t| jdkr~td| j dkr| jd k	rt| jdkrtd| jd kr| j dkrd	n| j| jd
< d S )N)%abszunit must be either % or abs)closestZpct1_dropremainderz5rounding must be either closest or pct1_dropremainderr   zAIt is forbidden to specify rounding if not using percent slicing.d   z2Percent slice boundaries must be > -100 and < 100.r   r   )r   r<   r   r@   r   rA   __dict__rM   r   r   r   __post_init__  s    ""z"_RelativeInstruction.__post_init__)r   r   r   r   r9   r$   r@   r   r#   rA   r   r   r   r   r   r   r   r     s   
r   c                 C   s   t | }|std|  |ds0|dr4dnd}t|d|d|drbt|dnd	|d
r|t|d
nd	|dS )z)Returns ReadInstruction for given string.z!Unrecognized instruction format: Zfrom_pctZto_pctr   r   r3   r   fromNrA   )
split_namer   r@   rA   r   )_SUB_SPEC_REmatchr<   groupr&   r#   )specresr   r   r   r   _str_to_read_instruction  s    
r   c                 C   s&   |dk rd}t || t|d  S )Nr   zUsing "pct1_dropremainder" rounding on a split with less than 100 elements is forbidden: it always results in an empty dataset.      Y@)r<   mathtrunc)boundaryr!   rj   r   r   r   _pct_to_abs_pct1  s
    r   c                 C   s   t t| | d S )Nr   )r#   round)r   r!   r   r   r   _pct_to_abs_closest  s    r   c                 C   s4  | j dkrtnt}| j}||kr:td| dt| d|| }| j}| j}| jdkr|dkrddn|||}|dkrz|n|||}n |dkrdn|}|dkr|n|}t	||kst	||krd|pd	 d
|pd	 d| d}t||dk r|| }n|dkrd}|dk r|| }n||kr(d}t
|||S )zReturns _AbsoluteInstruction instance for given RelativeInstruction.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.
    r   zUnknown split "z". Should be one of .r   Nr   zRequested slice [ :z] incompatible with z
 examples.)r   r   r   r?   r<   listr@   rA   r   r   r   )	rel_instrrD   Z
pct_to_absr3   r!   r@   rA   rj   r   r   r   _rel_to_abs_instr  s0    
 




r   c                   @   sb   e Zd ZdZdd Zedd ZdddZed	d
 Zdd Z	dd Z
dd Zdd Zdd ZdS )r&   a  Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
          for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
          for k in range(0, 100, 10)])

    c                 C   s
   || _ d S rU   _relative_instructions)rM   relative_instructionsr   r   r   _init  s    zReadInstruction._initc                 C   s   |  | }|| |S )zCReturns ReadInstruction obj initialized with relative_instructions.)__new__r   )clsr   resultr   r   r   ,_read_instruction_from_relative_instructions  s    

z<ReadInstruction._read_instruction_from_relative_instructionsNc                 C   s   |  t|||||g dS )a  Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        N)r   r   )rM   r   r   r@   rA   r   r   r   r   rN     s    zReadInstruction.__init__c                 C   sL   t |}t|}|s$td| t|d }tdd |dd D |S )aM  Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 10% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        z&No instructions could be built out of r   c                 s   s   | ]}t |V  qd S rU   )r   )r.   subr   r   r   rW   V  s     z,ReadInstruction.from_spec.<locals>.<genexpr>r   N)r9   _ADDITION_SEP_REr3   r<   r   sum)r   r   subsr)   r   r   r   r=   9  s    
zReadInstruction.from_specc           
      C   s   g }| j D ]}|j}|jd k	s(|jd k	r|j}|j}|j}|j}|dkrL|nd}|d k	rdt|| nd}|d k	r|t|| nd}d| d| d}|dkr|d k	r|dkrd| dnd}	|||	 7 }|| q
d	|S )
Nr   r   [r   ]r   ()+)	r   r?   r@   rA   r   r   r9   rB   ra   )
rM   Zrel_instr_specsr   Zrel_instr_specr@   rA   r   r   Z	slice_strZrounding_strr   r   r   to_specX  s"    
&zReadInstruction.to_specc                 C   sj   t |tsd}t|| j}|j}|d jdkr\|d jdkr\| jd j|d jkr\td| || S )zEReturns a new ReadInstruction obj, result of appending other to self.zAReadInstruction can only be added to another ReadInstruction obj.r   r   zPIt is forbidden to sum ReadInstruction instances with different rounding values.)r8   r&   r:   r   r   r   r<   r   )rM   otherrj   Zself_risZ	other_risr   r   r   __add__l  s    
zReadInstruction.__add__c                 C   s   |   S rU   )r   r   r   r   r   __str__{  s    zReadInstruction.__str__c                 C   s   d| j  dS )NzReadInstruction(r   r   r   r   r   r   __repr__~  s    zReadInstruction.__repr__c                    s    fdd| j D S )aZ  Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        c                    s   g | ]}t | qS r   )r   )r.   r   rD   r   r   r[     s     z/ReadInstruction.to_absolute.<locals>.<listcomp>r   )rM   rD   r   r   r   r>     s    zReadInstruction.to_absolute)NNNN)r   r   r   r   r   classmethodr   rN   r=   r   r   r   r   r>   r   r   r   r   r&     s   &


)NN)?r   r^   r   r`   rers   dataclassesr   pathlibr   typingr   r   r   r   Zpyarrowrc   Zpyarrow.parquetr   r   Zdownload.download_configr	   Znamingr
   r   r   r   r   r   r   utilsr   Zutils.file_utilsr   r/   r   rl   r   r   Z
get_loggerr   loggerrp   compileXr   r   ConnectionErrorr   r   r    r9   rF   rG   r|   r   r   r   r   r   r   r   r&   r   r   r   r   <module>   sl   


  
H /"