"""DatasetInfo and MetricInfo record information we know about a dataset and a metric.

This includes things that we know about the dataset statically, i.e.:
 - description
 - canonical location
 - does it have validation and tests splits
 - size
 - etc.

This also includes the things that can and should be computed once we've
processed the dataset as well:
 - number of examples (in each split)
 - etc.
"""

import copy
import dataclasses
import json
import os
import posixpath
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import ClassVar, Dict, List, Optional, Union

import fsspec
from huggingface_hub import DatasetCard, DatasetCardData

from . import config
from .features import Features, Value
from .filesystems import is_remote_filesystem
from .splits import SplitDict
from .tasks import TaskTemplate, task_template_from_dict
from .utils import Version
from .utils.logging import get_logger
from .utils.py_utils import asdict, unique_values


logger = get_logger(__name__)
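

# Illustrative usage only (the dataset name is just an example): a `DatasetInfo` is
# usually obtained from a loaded dataset rather than constructed by hand.
#
#   from datasets import load_dataset
#   ds = load_dataset("rotten_tomatoes", split="validation")
#   print(ds.info.description)
#   print(ds.info.features)
#   print(ds.info.splits)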


@dataclass
class SupervisedKeysData:
    input: str = ""
    output: str = ""


@dataclass
class DownloadChecksumsEntryData:
    key: str = ""
    value: str = ""


class MissingCachedSizesConfigError(Exception):
    """The expected cached sizes of the download file are missing."""


class NonMatchingCachedSizesError(Exception):
    """The prepared split doesn't have expected sizes."""


@dataclass
class PostProcessedInfo:
    features: Optional[Features] = None
    resources_checksums: Optional[dict] = None

    def __post_init__(self):
        # Convert back to the correct classes when we reload from dict
        if self.features is not None and not isinstance(self.features, Features):
            self.features = Features.from_dict(self.features)

    @classmethod
    def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in post_processed_info_dict.items() if k in field_names})


@dataclass
class DatasetInfo:
    """Information about a dataset.

    `DatasetInfo` documents datasets, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        task_templates (`List[TaskTemplate]`, *optional*):
            The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's [`Features`] to standardized column names and types as detailed in `datasets.tasks`.
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    """

    # Set in the dataset scripts
    description: str = dataclasses.field(default_factory=str)
    citation: str = dataclasses.field(default_factory=str)
    homepage: str = dataclasses.field(default_factory=str)
    license: str = dataclasses.field(default_factory=str)
    features: Optional[Features] = None
    post_processed: Optional[PostProcessedInfo] = None
    supervised_keys: Optional[SupervisedKeysData] = None
    task_templates: Optional[List[TaskTemplate]] = None

    # Set later by the builder
    builder_name: Optional[str] = None
    dataset_name: Optional[str] = None  # for packaged builders, this can differ from builder_name
    config_name: Optional[str] = None
    version: Optional[Union[str, Version]] = None
    # Set later by `download_and_prepare`
    splits: Optional[dict] = None
    download_checksums: Optional[dict] = None
    download_size: Optional[int] = None
    post_processing_size: Optional[int] = None
    dataset_size: Optional[int] = None
    size_in_bytes: Optional[int] = None

    _INCLUDED_INFO_IN_YAML: ClassVar[List[str]] = [
        "config_name",
        "download_size",
        "dataset_size",
        "features",
        "splits",
    ]

    def __post_init__(self):
        # Convert back to the correct classes when we reload from dict
        if self.features is not None and not isinstance(self.features, Features):
            self.features = Features.from_dict(self.features)
        if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):
            self.post_processed = PostProcessedInfo.from_dict(self.post_processed)
        if self.version is not None and not isinstance(self.version, Version):
            if isinstance(self.version, str):
                self.version = Version(self.version)
            else:
                self.version = Version.from_dict(self.version)
        if self.splits is not None and not isinstance(self.splits, SplitDict):
            self.splits = SplitDict.from_split_dict(self.splits)
        if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):
            if isinstance(self.supervised_keys, (tuple, list)):
                self.supervised_keys = SupervisedKeysData(*self.supervised_keys)
            else:
                self.supervised_keys = SupervisedKeysData(**self.supervised_keys)
        if self.task_templates is not None:
            if isinstance(self.task_templates, (list, tuple)):
                templates = [
                    template if isinstance(template, TaskTemplate) else task_template_from_dict(template)
                    for template in self.task_templates
                ]
                self.task_templates = [template for template in templates if template is not None]
            elif isinstance(self.task_templates, TaskTemplate):
                self.task_templates = [self.task_templates]
            else:
                template = task_template_from_dict(self.task_templates)
                self.task_templates = [template] if template is not None else []
        # Align task templates with features
        if self.task_templates is not None:
            self.task_templates = list(self.task_templates)
            if self.features is not None:
                self.task_templates = [
                    template.align_with_features(self.features) for template in self.task_templates
                ]

    def write_to_directory(
        self, dataset_info_dir, pretty_print=False, fs="deprecated", storage_options: Optional[dict] = None
    ):
        """Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
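
        Writing to a remote filesystem is also possible by passing `storage_options`.
        A sketch (the S3 bucket name is only a placeholder, and the matching fsspec
        backend, e.g. `s3fs`, is assumed to be installed):

        ```py
        >>> import fsspec
        >>> fs = fsspec.filesystem("s3")
        >>> ds.info.write_to_directory("s3://my-bucket/my-dataset", storage_options=fs.storage_options)
        ```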
        """
        if fs != "deprecated":
            warnings.warn(
                "'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.\n"
                "You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
                FutureWarning,
            )
            storage_options = fs.storage_options

        fs_token_paths = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
        fs: fsspec.AbstractFileSystem = fs_token_paths[0]
        is_local = not is_remote_filesystem(fs)
        path_join = os.path.join if is_local else posixpath.join

        with fs.open(path_join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
            self._dump_info(f, pretty_print=pretty_print)
        if self.license:
            with fs.open(path_join(dataset_info_dir, config.LICENSE_FILENAME), "wb") as f:
                self._dump_license(f)

    def _dump_info(self, file, pretty_print=False):
        """Dump info in `file` file-like object open in bytes mode (to support remote files)"""
        file.write(json.dumps(asdict(self), indent=4 if pretty_print else None).encode("utf-8"))

    def _dump_license(self, file):
        """Dump license in `file` file-like object open in bytes mode (to support remote files)"""
        file.write(self.license.encode("utf-8"))

    @classmethod
    def from_merge(cls, dataset_infos: List["DatasetInfo"]):
        dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None]

        description = "\n\n".join(unique_values(info.description for info in dataset_infos)).strip()
        citation = "\n\n".join(unique_values(info.citation for info in dataset_infos)).strip()
        homepage = "\n\n".join(unique_values(info.homepage for info in dataset_infos)).strip()
        license = "\n\n".join(unique_values(info.license for info in dataset_infos)).strip()
        features = None
        supervised_keys = None
        task_templates = None

        # Keep only the task templates common to all the dataset infos being merged
        all_task_templates = [info.task_templates for info in dataset_infos if info.task_templates is not None]
        if len(all_task_templates) > 1:
            task_templates = list(set(all_task_templates[0]).intersection(*all_task_templates[1:]))
        elif len(all_task_templates):
            task_templates = list(set(all_task_templates[0]))
        # If no common task templates were found, replace the empty list with None
        task_templates = task_templates if task_templates else None

        return cls(
            description=description,
            citation=citation,
            homepage=homepage,
            license=license,
            features=features,
            supervised_keys=supervised_keys,
            task_templates=task_templates,
        )

    @classmethod
    def from_directory(
        cls, dataset_info_dir: str, fs="deprecated", storage_options: Optional[dict] = None
    ) -> "DatasetInfo":
        """Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
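
        Reading the info back from a remote path works the same way (again a sketch;
        the bucket name is a placeholder and the matching fsspec backend is assumed
        to be installed):

        ```py
        >>> import fsspec
        >>> fs = fsspec.filesystem("s3")
        >>> ds_info = DatasetInfo.from_directory("s3://my-bucket/my-dataset", storage_options=fs.storage_options)
        ```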
        """
        if fs != "deprecated":
            warnings.warn(
                "'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.\n"
                "You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
                FutureWarning,
            )
            storage_options = fs.storage_options

        fs_token_paths = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
        fs: fsspec.AbstractFileSystem = fs_token_paths[0]
        logger.info(f"Loading Dataset info from {dataset_info_dir}")
        if not dataset_info_dir:
            raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")

        is_local = not is_remote_filesystem(fs)
        path_join = os.path.join if is_local else posixpath.join

        with fs.open(path_join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
            dataset_info_dict = json.load(f)
        return cls.from_dict(dataset_info_dict)

    @classmethod
    def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})

    def update(self, other_dataset_info: "DatasetInfo", ignore_none=True):
        self_dict = self.__dict__
        self_dict.update(
            **{
                k: copy.deepcopy(v)
                for k, v in other_dataset_info.__dict__.items()
                if (v is not None or not ignore_none)
            }
        )

    def copy(self) -> "DatasetInfo":
        return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})

    def _to_yaml_dict(self) -> dict:
        yaml_dict = {}
        dataset_info_dict = asdict(self)
        for key in dataset_info_dict:
            if key in self._INCLUDED_INFO_IN_YAML:
                value = getattr(self, key)
                if hasattr(value, "_to_yaml_list"):  # Features, SplitDict
                    yaml_dict[key] = value._to_yaml_list()
                elif hasattr(value, "_to_yaml_string"):  # Version
                    yaml_dict[key] = value._to_yaml_string()
                else:
                    yaml_dict[key] = value
        return yaml_dict

    @classmethod
    def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
        yaml_data = copy.deepcopy(yaml_data)
        if yaml_data.get("features") is not None:
            yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
        if yaml_data.get("splits") is not None:
            yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in yaml_data.items() if k in field_names})


class DatasetInfosDict(Dict[str, DatasetInfo]):
    def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
        total_dataset_infos = {}
        dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
        dataset_readme_path = os.path.join(dataset_infos_dir, "README.md")
        if not overwrite:
            total_dataset_infos = self.from_directory(dataset_infos_dir)
        total_dataset_infos.update(self)
        if os.path.exists(dataset_infos_path):
            # for backward compatibility, also update the JSON file if it exists
            with open(dataset_infos_path, "w", encoding="utf-8") as f:
                dataset_infos_dict = {
                    config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()
                }
                json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)
        # Dump the infos in the YAML part of the README.md file
        if os.path.exists(dataset_readme_path):
            dataset_card = DatasetCard.load(dataset_readme_path)
            dataset_card_data = dataset_card.data
        else:
            dataset_card = None
            dataset_card_data = DatasetCardData()
        if total_dataset_infos:
            total_dataset_infos.to_dataset_card_data(dataset_card_data)
            dataset_card = (
                DatasetCard("---\n" + str(dataset_card_data) + "\n---\n") if dataset_card is None else dataset_card
            )
            dataset_card.save(Path(dataset_readme_path))

    @classmethod
    def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
        logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
        # Load the info from the YAML part of README.md if it exists
        if os.path.exists(os.path.join(dataset_infos_dir, "README.md")):
            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / "README.md").data
            if "dataset_info" in dataset_card_data:
                return cls.from_dataset_card_data(dataset_card_data)
        if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
            # backward compatibility with dataset_infos.json files
            with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
                return cls(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
        else:
            return cls()

    @classmethod
    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> "DatasetInfosDict":
        if isinstance(dataset_card_data.get("dataset_info"), (list, dict)):
            if isinstance(dataset_card_data["dataset_info"], list):
                return cls(
                    {
                        dataset_info_yaml_dict.get("config_name", "default"): DatasetInfo._from_yaml_dict(
                            dataset_info_yaml_dict
                        )
                        for dataset_info_yaml_dict in dataset_card_data["dataset_info"]
                    }
                )
            else:
                dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data["dataset_info"])
                dataset_info.config_name = dataset_card_data["dataset_info"].get("config_name", "default")
                return cls({dataset_info.config_name: dataset_info})
        else:
            return cls()

    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:
        if self:
            # first get the existing metadata info
            if "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], dict):
                dataset_metadata_infos = {
                    dataset_card_data["dataset_info"].get("config_name", "default"): dataset_card_data["dataset_info"]
                }
            elif "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], list):
                dataset_metadata_infos = {
                    config_metadata["config_name"]: config_metadata
                    for config_metadata in dataset_card_data["dataset_info"]
                }
            else:
                dataset_metadata_infos = {}
            # update/rewrite the existing metadata info with the one to dump
            total_dataset_infos = {
                **dataset_metadata_infos,
                **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},
            }
            # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo
            for config_name, dset_info_yaml_dict in total_dataset_infos.items():
                dset_info_yaml_dict["config_name"] = config_name
            if len(total_dataset_infos) == 1:
                # use a struct instead of a list of configurations, since there's only one
                dataset_card_data["dataset_info"] = next(iter(total_dataset_infos.values()))
                config_name = dataset_card_data["dataset_info"].pop("config_name", None)
                if config_name != "default":
                    # if config_name is not "default", preserve it and put it in first position
                    dataset_card_data["dataset_info"] = {
                        "config_name": config_name,
                        **dataset_card_data["dataset_info"],
                    }
            else:
                dataset_card_data["dataset_info"] = []
                for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):
                    # add the config_name field in first position
                    dataset_info_yaml_dict.pop("config_name", None)
                    dataset_info_yaml_dict = {"config_name": config_name, **dataset_info_yaml_dict}
                    dataset_card_data["dataset_info"].append(dataset_info_yaml_dict)


@dataclass
class MetricInfo:
    """Information about a metric.

    `MetricInfo` documents a metric, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Note: Not all fields are known on construction and may be updated later.
    """

    # Set in the metric scripts
    description: str
    citation: str
    features: Features
    inputs_description: str = dataclasses.field(default_factory=str)
    homepage: str = dataclasses.field(default_factory=str)
    license: str = dataclasses.field(default_factory=str)
    codebase_urls: List[str] = dataclasses.field(default_factory=list)
    reference_urls: List[str] = dataclasses.field(default_factory=list)
    streamable: bool = False
    format: Optional[str] = None

    # Set later by the builder
    metric_name: Optional[str] = None
    config_name: Optional[str] = None
    experiment_id: Optional[str] = None

    def __post_init__(self):
        if self.format is not None:
            for key, value in self.features.items():
                if not isinstance(value, Value):
                    raise ValueError(
                        f"When using 'numpy' format, all features should be a `datasets.Value` feature. "
                        f"Here {key} is an instance of {value.__class__.__name__}"
                    )

    def write_to_directory(self, metric_info_dir, pretty_print=False):
        """Write `MetricInfo` as JSON to `metric_info_dir`.
        Also save the license separately in LICENSE.
        If `pretty_print` is True, the JSON will be pretty-printed with the indent level of 4.

        Example:

        ```py
        >>> from datasets import load_metric
        >>> metric = load_metric("accuracy")
        >>> metric.info.write_to_directory("/path/to/directory/")
        ```
        """
        with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "w", encoding="utf-8") as f:
            json.dump(asdict(self), f, indent=4 if pretty_print else None)

        if self.license:
            with open(os.path.join(metric_info_dir, config.LICENSE_FILENAME), "w", encoding="utf-8") as f:
                f.write(self.license)

    @classmethod
    def from_directory(cls, metric_info_dir) -> "MetricInfo":
        """Create MetricInfo from the JSON file in `metric_info_dir`.

        Args:
            metric_info_dir: `str` The directory containing the metadata file. This
                should be the root directory of a specific dataset version.

        Example:

        ```py
        >>> from datasets import MetricInfo
        >>> metric_info = MetricInfo.from_directory("/path/to/directory/")
        ```
        zLoading Metric info from zCCalling MetricInfo.from_directory() with undefined metric_info_dir.r~   r   )r   r   r   rp   rl   rm   rn   r   r   r   r   r.   )rD   r   r8   metric_info_dictr    r    r!   r   ;  s    zMetricInfo.from_directory)r   r3   c                    s0   dd t | D  | f  fdd| D S )Nc                 S   s   h | ]
}|j qS r    r4   r6   r    r    r!   r9   T  s     z'MetricInfo.from_dict.<locals>.<setcomp>c                    s   i | ]\}}| kr||qS r    r    r:   r=   r    r!   r?   U  s       z(MetricInfo.from_dict.<locals>.<dictcomp>r@   )rD   r   r    r=   r!   r.   R  s    zMetricInfo.from_dict)F)r   r   r   r'   r   r   r   rA   r   r   rL   rM   r`   r   r   r   r   boolr   r   r   rS   r   r1   rz   rF   r   rE   r.   r    r    r    r!   r     s(   
		
r   )3r'   r   rA   r   rl   ro   rg   r   pathlibr   typingr   r   r   r   r   rj   Zhuggingface_hubr	   r
   r   r   r*   r   r   Zfilesystemsr   rU   r   tasksr   r   utilsr   Zutils.loggingr   Zutils.py_utilsr   r   r   r   r   r"   	Exceptionr%   r(   r)   rG   r   r   r   r    r    r    r!   <module>   sF     :m
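

# A minimal round-trip sketch (illustrative only; the path and feature name below are
# assumptions, not something this module provides):
#
#   os.makedirs("/tmp/demo_dataset_info", exist_ok=True)
#   info = DatasetInfo(description="demo", features=Features({"text": Value("string")}))
#   info.write_to_directory("/tmp/demo_dataset_info", pretty_print=True)
#   reloaded = DatasetInfo.from_directory("/tmp/demo_dataset_info")
#   assert reloaded.features == info.features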