U
    -ez                    @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZ ddlZddlZddlmZmZm Z  dd	l!m"Z" dd
l#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZL ddlMmNZNmOZOmPZPmQZQmRZR ddlSmTZT ddlUmVZV ddlWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z` ddlambZb ddlcmdZd ddlemfZfmgZg ddlhmiZi dd ljmkZk dd!llmmZm dd"lnmoZo eiepZqereNs d#g Zte"judfeveeeevf  d$d%d&Zwdeeee' eeI f  d(d)d*ZxG d+d, d,Zyee' ee& eev evee' d-d.d/Zzdd0eev ee' d1d2d3Z{eev evd4d5d6Z|evevd7d8d9Z}eveveevevevevf ee4 eeevevf  d:d;d<Z~eveveveveeevevf  eeevevf  eee6evf  evd=d>d?Zeveeevevf  eeevevf  eveveve6eevevf d@dAdBZde+ee4 eeevevf  dCdDdEZde+ee4 eeevevf  dCdFdGZde*eev ee4 eeev eevef f dHdIdJZeveevdKdLdMZdevekeeev eevef ee4 eee& evf dNdOdPZeG dQdR dRZeG dSd0 d0ZeG dTdU dUZG dVdW dWZG dXdY dYZG dZd[ d[eZG d\d] d]eZG d^d_ d_eZG d`da daeZG dbdc dceZG ddde deeZG dfdg dgeZG dhdi dieZG djdk dkeZdeveeeveof  ee4 eee6evf  eev eev eeeeeve*f  edldmdnZeVdodeveeeveof  ee4 eee6evf  eev edpdqdrZeVdsdeveev eeeev eev eee4 eee6evf  eeeveof  eIdudvdwZdeveev eev eeeveev eeveeveev f f f  eev ee= ee4 eee6evf  eeeveof  eeeevf  ee e'dydzd{Zdeveev eev eeeveev eeveeveev f f f  eeeveTf  eev ee= ee4 eee6evf  eeefevf  ee eeeeveof  eeeevf  eee ee ee1e$e2eGf d|d}d~Zdevee ee ee$e1f dddZdS )zAccess datasets.    N)Counter)	dataclassfield)Path)	AnyDictListMappingOptionalSequenceTupleTypeUnion)DatasetCardDatasetCardDataHfApi   )config)Dataset)BuilderConfigDatasetBuilder)DEFAULT_PATTERNS_ALLDataFilesDictDataFilesListEmptyDatasetErrorget_data_patternsget_metadata_patternssanitize_patterns)DatasetDictIterableDatasetDict)DownloadConfig)DownloadMode)StreamingDownloadManager	xbasenamexglobxjoin)Features)extract_path_from_uriis_remote_filesystem)Hasher)DatasetInfoDatasetInfosDict)IterableDataset)Metric)camelcase_to_snakecasesnakecase_to_camelcase)_EXTENSION_TO_MODULE_MODULE_SUPPORTS_METADATA_MODULE_TO_EXTENSIONS_PACKAGED_DATASETS_MODULES_hash_python_lines)Split)
deprecated)	OfflineModeIsEnabled!_raise_if_offline_mode_is_enabledcached_path
head_hf_s3hf_github_urlinit_hf_modulesis_relative_pathrelative_to_absolute_pathurl_or_path_join)FileLock)
hf_hub_url)VerificationModeis_small_dataset)
get_logger)MetadataConfigs)get_imports)Version.zip)namehf_modules_cachec              	   C   s^   t |}tj|| }tj|dd tjtj|dsZttj|dd W 5 Q R X |S )a^  
    Create a module with name `name` in which you can add dynamic modules
    such as metrics or datasets. The module can be imported using its name.
    The module is created in the HF_MODULE_CACHE directory by default (~/.cache/huggingface/modules) but it can
    be overridden by specifying a path to another directory in `hf_modules_cache`.
    Texist_ok__init__.pyw)r<   ospathjoinmakedirsexistsopen)rI   rJ   dynamic_modules_path rV   N/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/load.pyinit_dynamic_modules\   s    	
rX   Treturnc                 C   sv   t | }|rt}nt}d}|j D ]J\}}t|r&t||r&t	|rNq&|}t
|}|dk	r&||kr& qrq&|S )zImport a module at module_path and return its main class:
    - a DatasetBuilder if dataset is True
    - a Metric if dataset is False
    N)	importlibimport_moduler   r-   __dict__itemsinspectisclass
issubclass
isabstract	getmodule)module_pathdatasetmoduleZmain_cls_typeZmodule_main_clsrI   objZ
obj_modulerV   rV   rW   import_main_classn   s    


rh   c                   @   s   e Zd ZdZdd ZdS )#_InitializeConfiguredDatasetBuilderaL  
    From https://stackoverflow.com/questions/4647566/pickle-a-dynamically-parameterized-sub-class
    See also ConfiguredDatasetBuilder.__reduce__
    When called with the param value as the only argument, returns an
    un-initialized instance of the parameterized class. Subsequent __setstate__
    will be called by pickle.
    c                 C   s   t  }t||||d|_|S )N)default_config_namedataset_name)ri   configure_builder_class	__class__)selfbuilder_clsmetadata_configsrj   rI   rg   rV   rV   rW   __call__   s       z,_InitializeConfiguredDatasetBuilder.__call__N)__name__
__module____qualname____doc__rq   rV   rV   rV   rW   ri      s   ri   )ro   builder_configsrj   rk   rZ   c                    sT   G  fddd } j    t| |_  j    t| |_|S )z
    Dynamically create a builder class with custom builder configs parsed from README.md file,
    i.e. set BUILDER_CONFIGS class variable of a builder class to custom configs list.
    c                       s"   e Zd ZZZ jZdd ZdS )z9configure_builder_class.<locals>.ConfiguredDatasetBuilderc                 S   s,   | j jd }t || j| j| jf| j fS )Nr   )rm   __mro__ri   BUILDER_CONFIGSDEFAULT_CONFIG_NAMErk   r]   copy)rn   Zparent_builder_clsrV   rV   rW   
__reduce__   s    zDconfigure_builder_class.<locals>.ConfiguredDatasetBuilder.__reduce__N)rr   rs   rt   rx   ry   r{   rV   ro   rv   rj   rV   rW   ConfiguredDatasetBuilder   s   r}   )rr   lower
capitalizer/   rt   )ro   rv   rj   rk   r}   rV   r|   rW   rl      s    rl   DatasetModule)dataset_modulerk   rZ   c                 C   s.   t | j}| jjr*t|| jj| jj|d}|S )N)rv   rj   rk   )rh   rd   builder_configs_parametersrv   rl   rj   )r   rk   ro   rV   rV   rW   get_dataset_builder_class   s    
r   )
file_pathsrZ   c              
   C   sz   g }| D ]4}t j|r2|tt|d q|| qg }|D ]*}t|dd}||	  W 5 Q R X qFt
|S )zt
    Convert a list of scripts or text files provided in file_paths into a hashed filename in a repeatable way.
    z
*.[pP][yY]utf-8encoding)rO   rP   isdirextendlistr   rglobappendrT   	readlinesr4   )r   Zto_use_files	file_pathlinesfrV   rV   rW   files_to_hash   s    r   rI   resource_typec                 C   s@   t js<t jr<zt| | d |dkd W n tk
r:   Y nX dS )z1Update the download count of a dataset or metric..pyre   )filenamere   N)r   HF_DATASETS_OFFLINEZHF_UPDATE_DOWNLOAD_COUNTSr:   	Exceptionr   rV   rV   rW   increase_load_count   s
    r   )rI   	base_pathimportsdownload_configrZ   c                 C   s  g }g }|  }|jdkr d|_|D ]\}}}}	|dkrH|||f q$|| krrtd|  d| d| d| d	|d	krt||d
 }
n|dkr|}
ntdt|
|d}|	dk	rtj||	}|||f q$i }|D ]J\}}zt	
|}W q tk
r"   ||ks||kr|||< Y qX q|rt|dkr>dnd}t|dkrTdnd}d| krnd|d< td|  d| dd| d| dd|  d|S )a  
    Download additional module for a module <name>.py at URL (or local path) <base_path>/<name>.py
    The imports must have been parsed first using ``get_imports``.

    If some modules need to be installed with pip, an error is raised showing how to install them.
    This function return the list of downloaded modules as tuples (import_name, module_file_path).

    The downloaded modules can then be moved into an importable directory with ``_copy_script_and_other_resources_in_importable_dir``.
    NzDownloading extra modulesZlibraryzError in the z script, importing relative z module but z: is the name of the script. Please change relative import zl to another name and add a '# From: URL_OR_PATH' comment pointing to the original relative import file path.Zinternalr   ZexternalzWrong import_typer   r   dependencies
dependencyZthemitZsklearnzscikit-learnzTo be able to use z$, you need to install the following : z, z.
Please install z using 'pip install  z' for instance.)rz   download_descr   
ValueErrorr?   r9   rO   rP   rQ   r[   r\   ImportErrorlenkeysvalues)rI   r   r   r   local_importsZlibrary_importsZimport_typeimport_nameimport_pathZsub_directoryZurl_or_filenameZlocal_import_pathZneeds_to_be_installedZlibrary_import_nameZlibrary_import_pathlibZ_dependencies_strZ	_them_strrV   rV   rW   _download_additional_modules   sR    
2r   )rI   importable_directory_pathsubdirectory_nameoriginal_local_pathr   additional_filesdownload_moderZ   c              
   C   s,  t j||}t j|| d }|d }	t|	 |tjkrTt j|rTt| t j	|dd t j|d}
t j|
st
|
d W 5 Q R X t j	|dd t j|d}
t j|
st
|
d W 5 Q R X t j|st|| t j|d d }t j|s:||d	}t
|dd
d}t|| W 5 Q R X |D ]\}}t j|rt j||d }t j|st|| nFt j|rt j||}t j|st|| ntd| q>|D ]@\}}t j||}t j|rt||st|| q|W  5 Q R  S Q R X dS )a  Copy a script and its required imports to an importable directory

    Args:
        name (str): name of the resource to load
        importable_directory_path (str): path to the loadable folder in the dynamic modules directory
        subdirectory_name (str): name of the subdirectory in importable_directory_path in which to place the script
        original_local_path (str): local path to the resource script
        local_imports (List[Tuple[str, str]]): list of (destination_filename, import_file_to_copy)
        additional_files (List[Tuple[str, str]]): list of (destination_filename, additional_file_to_copy)
        download_mode (Optional[Union[DownloadMode, str]]): download mode

    Return:
        importable_local_file: path to an importable module with importlib.import_module
    r   z.lockTrK   rM   rN   r   z.json)zoriginal file pathzlocal file pathr   r   zError with local import at N)rO   rP   rQ   r@   r!   FORCE_REDOWNLOADrS   shutilrmtreerR   rT   copyfilesplitextjsondumpisfiler   copytreer   filecmpcmp)rI   r   r   r   r   r   r   Zimportable_subdirectoryimportable_local_fileZ	lock_pathZinit_file_path	meta_pathmetaZ	meta_filer   r   Zfull_path_local_import	file_nameZoriginal_pathZdestination_additional_pathrV   rV   rW   2_copy_script_and_other_resources_in_importable_dir.  sR    



 r   )
local_pathr   r   rU   module_namespacerI   r   rZ   c              	   C   s   t j|||dd}t|jddd t|jd jdd t| gdd |D  }t	|
dd	 ||| |||d
}	td|	  dt j|||dd||
dd	 g}
|
|fS )N/--T)parentsrL   rM   rK   c                 S   s   g | ]}|d  qS )r   rV   ).0locrV   rV   rW   
<listcomp>  s     z+_create_importable_file.<locals>.<listcomp>)rI   r   r   r   r   r   r   z#Created importable dataset file at .)rO   rP   rQ   replacer   mkdirparenttouchr   r   splitloggerdebugbasename)r   r   r   rU   r   rI   r   r   hashr   rd   rV   rV   rW   _create_importable_file  s$    		&r   )data_files_listr   rZ   c                 C   s   t dd | dtj D }|rtttf tttf ddd}t| |ddD ]4\}}|t	krnt	|   S |d	krRt
| |d
  S qRdi fS )a*  Infer module (and builder kwargs) from list of data files.

    It picks the module based on the most common file extension.
    In case of a draw ".parquet" is the favorite, and then alphabetical order.

    Args:
        data_files_list (DataFilesList): List of data files.
        download_config (bool or str, optional): mainly use use_auth_token or storage_options to support different platforms and auth types.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - dict of builder kwargs
    c                 s   s6   | ].}t |d dd D ]}d |  V  qqdS r   r   Nr#   r   r~   r   filepathsuffixrV   rV   rW   	<genexpr>  s    z3infer_module_for_data_files_list.<locals>.<genexpr>N)	ext_countrZ   c                 S   s   | \}}||dk|fS )zBSort by count and set ".parquet" as the favorite in case of a drawz.parquetrV   )r   extcountrV   rV   rW   sort_key  s    z2infer_module_for_data_files_list.<locals>.sort_keyT)keyreverserH   r   )r   r   Z*DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCEr   strintboolsortedr^   r0   ,infer_module_for_data_files_list_in_archives)r   r   extensions_counterr   r   _rV   rV   rW    infer_module_for_data_files_list  s     r   c                 C   s   g }d}| D ]`}t |dr|d7 }|tjkr4 qntt |d}|dd t|d|dd	tj D 7 }qt	d
d |D }|r|
dd d }|tkrt| S d	i fS )a  Infer module (and builder kwargs) from list of archive data files.

    Args:
        data_files_list (DataFilesList): List of data files.
        download_config (bool or str, optional): mainly use use_auth_token or storage_options to support different platforms and auth types.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - dict of builder kwargs
    r   rH   r   z**c                 S   s   g | ]}| d d qS )z::r   r   )r   r   rV   rV   rW   r     s   z@infer_module_for_data_files_list_in_archives.<locals>.<listcomp>T)	recursiver   Nc                 s   s6   | ].}t |d dd D ]}d |  V  qqdS r   r   r   rV   rV   rW   r     s      z?infer_module_for_data_files_list_in_archives.<locals>.<genexpr>)r   endswithr   Z2GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCEr%   r"   extractr$   Z3ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCEr   most_commonr0   )r   r   Zarchived_filesZarchive_files_counterr   Z	extractedr   r   rV   rV   rW   r     s*    

r   )
data_filesrP   r   rZ   c                    s   fdd|   D }tt| \ t fdd| D rTtd| sz|rhd| dnd}td|  fS )	a  Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.

    Args:
        data_files (DataFilesDict): List of data files.
        path (str, optional): Dataset name or path.
        DownloadConfig (bool or str, optional): for authenticate on the Hugging Face Hub for private remote files.

    Returns:
        tuple[str, dict[str, Any]]: Tuple with
            - inferred module name
            - builder kwargs
    c                    s   i | ]\}}|t | d qS )r   )r   r   r   r   r   rV   rW   
<dictcomp>  s    z/infer_module_for_data_files.<locals>.<dictcomp>c                 3   s   | ]} f|kV  qd S NrV   )r   Zsplit_module)default_builder_kwargsmodule_namerV   rW   r      s     z.infer_module_for_data_files.<locals>.<genexpr>z=Couldn't infer the same data file format for all splits. Got z in z. z1No (supported) data files or dataset script found)r^   nextiterr   anyr   FileNotFoundError)r   rP   r   Zsplit_modulesrV   )r   r   r   rW   infer_module_for_data_files  s    
r   )r   config_parametersrZ   c                    sF   dddh  fddt | D }t }||  || | S )z
    Used to update hash of packaged modules which is used for creating unique cache directories to reflect
    different config parameters which are passed in metadata from readme.
    config_nameversiondescriptionc                    s   i | ]\}}| kr||qS rV   rV   r   paramvalueZparams_to_excluderV   rW   r     s      z6update_hash_with_config_parameters.<locals>.<dictcomp>)r   r^   r)   update	hexdigest)r   r   Zparams_to_add_to_hashmrV   r  rW   "update_hash_with_config_parameters  s    




r  )rd   rp   supports_metadatar   r   r   rZ   c                    s  t | }|j | }g }|d k	r&|nd}| D ]p\}	}
|
d}|
d}|r`|d | n|}z.|d k	rvt|nt|}tj||t	|d}W n: t
k
r } zt
d| d|	 d|W 5 d }~X Y nX |d krH|rH|tkrHzt|}W n tk
r   d }Y nX |d k	rHtj||d	rHtfd
d| D } fdd|
D }|rrtd| d | f |	||d fdd||
 D  q2||fS )N r   data_dirr   r   allowed_extensionsr   zDataset at 'z?' doesn't contain data files matching the patterns for config 'zZ', check `data_files` and `data_fir` parameters in the `configs` YAML field in README.md. r   c                    s   i | ]\}}||  qS rV   rV   r   )config_metadata_data_files_listrV   rW   r   E  s    z@create_builder_configs_from_metadata_configs.<locals>.<dictcomp>c                    s"   g | ]}t  |s|d kr|qS )defaulthasattr)r   r   builder_config_clsrV   rW   r   J  s    
  z@create_builder_configs_from_metadata_configs.<locals>.<listcomp>z#Some datasets params were ignored: zx. Make sure to use only valid params for the dataset builder and to have a up-to-date version of the `datasets` library.)rI   r   r	  c                    s(   i | ] \}}t  |r|d kr||qS ))r  r   r	  r  r   r  rV   rW   r   X  s
   
  )rh   ZBUILDER_CONFIG_CLASSZget_default_config_namer^   getr   r   r   from_patternsALL_ALLOWED_EXTENSIONSr   r   r   r   r   r   warningr   )rd   rp   r  r   r   r   ro   rj   rv   r   Zconfig_paramsZconfig_data_filesZconfig_data_dirZconfig_base_pathZconfig_patternsZconfig_data_files_dicteZconfig_metadata_patternsZignored_paramsrV   )r  r  rW   ,create_builder_configs_from_metadata_configs  sz    





 




r  c                   @   sF   e Zd ZU dZdZee ed< dZee	e
  ed< dZee ed< dS )BuilderConfigsParametersa  Dataclass containing objects related to creation of builder configurations from yaml's metadata content.

    Attributes:
        metadata_configs (`MetadataConfigs`, *optional*):
            Configs parsed from yaml's metadata.
        builder_configs (`list[BuilderConfig]`, *optional*):
            List of BuilderConfig objects created from metadata_configs above.
        default_config_name (`str`):
            Name of default config taken from yaml's metadata.
    Nrp   rv   rj   )rr   rs   rt   ru   rp   r
   rE   __annotations__rv   r   r   rj   r   rV   rV   rV   rW   r  b  s   
r  c                   @   sH   e Zd ZU eed< eed< eed< eedZeed< dZ	e
e ed< dS )r   rd   r   builder_kwargs)default_factoryr   Ndataset_infos)rr   rs   rt   r   r  dictr   r  r   r  r
   r+   rV   rV   rV   rW   r   t  s
   
c                   @   s   e Zd ZU eed< eed< dS )MetricModulerd   r   N)rr   rs   rt   r   r  rV   rV   rV   rW   r  }  s   
r  c                   @   s   e Zd ZedddZdS )_DatasetModuleFactoryrY   c                 C   s   t d S r   NotImplementedErrorrn   rV   rV   rW   
get_module  s    z _DatasetModuleFactory.get_moduleN)rr   rs   rt   r   r$  rV   rV   rV   rW   r     s   r   c                   @   s   e Zd ZedddZdS )_MetricModuleFactoryrY   c                 C   s   t d S r   r!  r#  rV   rV   rW   r$    s    z_MetricModuleFactory.get_moduleN)rr   rs   rt   r  r$  rV   rV   rV   rW   r%    s   r%  c                	   @   sr   e Zd ZdZeddeeeeef  ee	 eee
ef  ee dddZee eddd	Zed
ddZdS )GithubMetricModuleFactoryu   Get the module of a metric. The metric script is downloaded from GitHub.

    <Deprecated version="2.5.0">

    Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate

    </Deprecated>
    O   Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluateNrI   revisionr   r   rU   c                 C   sd   || _ || _|r| nt | _| jjdk r4d| j_|| _|| _| j ddksTt	t
|dd d S )N   r   r   metricr   )rI   r)  rz   r    r   max_retriesr   rU   r   AssertionErrorr   rn   rI   r)  r   r   rU   rV   rV   rW   __init__  s    	z"GithubMetricModuleFactory.__init__)r)  rZ   c                 C   s>   t | j| jd |dd}| j }|jd kr2d|_t||dS )Nr   FrP   rI   r)  re   Downloading builder scriptr   )r;   rI   r   rz   r   r9   )rn   r)  r   r   rV   rV   rW   download_loading_script  s
    

z1GithubMetricModuleFactory.download_loading_scriptrY   c              	   C   s   | j }z| |}| j }W nB tk
r\   |d k	r6 n"d}| |}td| j d Y nX t|}t| jt| jd|dd|| j	d}| j
r| j
nt }t||g |d| j| jd	\}}t  t||S )
Nmainz-Couldn't find a directory or a metric named 'zH' in this version. It was picked from the main branch on github instead.r  Fr1  rI   r   r   r   metricsr   r   r   rU   r   rI   r   )r)  r3  r   r   r  rI   rF   r   r;   r   rU   rX   r   r   r[   invalidate_cachesr  )rn   r)  r   r   r   rU   rd   r   rV   rV   rW   r$    s>    





z$GithubMetricModuleFactory.get_module)NNNN)rr   rs   rt   ru   r6   r   r
   r   rG   r    r!   r0  r3  r  r$  rV   rV   rV   rW   r&    s   	    r&  c                   @   sP   e Zd ZdZedd
eee eee	ef  ee dddZ
eddd	ZdS )LocalMetricModuleFactoryu   Get the module of a local metric. The metric script is loaded from a local script.

    <Deprecated version="2.5.0">

    Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate

    </Deprecated>
    r'  NrP   r   r   rU   c                 C   s.   || _ t|j| _|pt | _|| _|| _d S r   rP   r   stemrI   r    r   r   rU   rn   rP   r   r   rU   rV   rV   rW   r0    s
    z!LocalMetricModuleFactory.__init__rY   c              	   C   sn   t | j}t| jtt| jj|| jd}| jr6| jnt	 }t
| j|g |d| j| jd\}}t  t||S )Nr5  r6  r7  )rF   rP   r   rI   r   r   r   r   rU   rX   r   r   r[   r8  r  )rn   r   r   rU   rd   r   rV   rV   rW   r$    s&    


z#LocalMetricModuleFactory.get_module)NNN)rr   rs   rt   ru   r6   r   r
   r    r   r!   r0  r  r$  rV   rV   rV   rW   r9    s   	   r9  c                   @   sH   e Zd ZdZd	eee eeeef  ee dddZ	e
dddZdS )
#LocalDatasetModuleFactoryWithScriptzTGet the module of a local dataset. The dataset script is loaded from a local script.Nr:  c                 C   s.   || _ t|j| _|pt | _|| _|| _d S r   r;  r=  rV   rV   rW   r0    s
    z,LocalDatasetModuleFactoryWithScript.__init__rY   c           
   	   C   s   t | jjtj }t | jjd }t| j}t| jtt | jj|| j	d}g }|
 rl|tjt|f |
 r|d|f | jr| jnt }t| j|||d| j| jd\}}t  |tt | jjd}	t|||	S )N	README.mdr5  datasetsr7  )r   r   )r   rP   r   r   DATASETDICT_INFOS_FILENAMErF   r   rI   r   r   is_filer   rU   rX   r   r   r[   r8  r   )
rn   dataset_infos_pathdataset_readme_pathr   r   r   rU   rd   r   r  rV   rV   rW   r$    s6    


z.LocalDatasetModuleFactoryWithScript.get_module)NNN)rr   rs   rt   ru   r   r
   r    r   r!   r0  r   r$  rV   rV   rV   rW   r>  	  s      r>  c                   @   sR   e Zd ZdZd	eee eeeeef  eee	ef  dddZ
edddZdS )
&LocalDatasetModuleFactoryWithoutScriptzGet the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred
    from the data files extensions.N)rP   r	  r   r   c                 C   sN   |rt j|rtd| t| | _t|j| _|| _|| _	|| _
d S )Nz;`data_dir` must be relative to a dataset directory's root: )rO   rP   isabsr   r   as_posixr<  rI   r   r	  r   )rn   rP   r	  r   r   rV   rV   rW   r0  >  s    z/LocalDatasetModuleFactoryWithoutScript.__init__rY   c              
      s  t j| jd}t j|r(t|jnt }t	|}t
	|}t| j| jpPd   }| jd k	rvt| j}n:|rdtt| krttt| d }nt|}tj||td}t|| jd\}}	|t| }|tk}
| jd krd|
rd|tkrdzt|}W n tk
r*   d }Y nX |d k	rdt j||d  rdt fdd|! D }t"| \}}|rt#||||
|	d	\}}nd
\}}|| jt$t| jj%d}| jd k	s|s||d< |&|	 t jt j| jt'j(rht)t j| jt'j(ddJ}t
dd t*|! D }t+|dkrPtt|}|,||d< W 5 Q R X |&| |}|d krt+|dkrtt|}t-||||t.|||ddS )Nr?  r  r   )r   r  )r   rP   r  c                    s   i | ]\}}||  qS rV   rV   r   Zmetadata_data_files_listrV   rW   r   q  s    zELocalDatasetModuleFactoryWithoutScript.get_module.<locals>.<dictcomp>)r   r  r   NN)r   r   rk   r   r   c                 S   s   i | ]\}}|t |qS rV   r*   	from_dictr   r   Zdataset_info_dictrV   rV   rW   r     s    r   r  rp   rv   rj   r  r   )/rO   rP   rQ   r   r   loaddatar   rE   from_dataset_card_datar+   r   r	  
expanduserresolverG  r   r   r   r   r   r   r   r  r  r   filter_extensionsr2   r1   r   r   r   r   r^   r3   r  r.   rI   r  r   rA  rT   r   r   popr   r  )rn   Zreadme_pathdataset_card_datarp   r  r   patternsr   r   r   r  metadata_patternsrd   r   rv   rj   r  r   legacy_dataset_infoslegacy_config_namerV   rH  rW   r$  N  s    








z1LocalDatasetModuleFactoryWithoutScript.get_module)NNN)rr   rs   rt   ru   r   r
   r   r   r   r!   r0  r   r$  rV   rV   rV   rW   rE  :  s      rE  c                	   @   sX   e Zd ZdZd	eee eeeeef  ee	 eee
ef  dddZedddZdS )
PackagedDatasetModuleFactoryz`Get the dataset builder module from the ones that are packaged with the library: csv, json, etc.N)rI   r	  r   r   r   c                 C   s.   || _ || _|| _|| _|| _t|dd d S Nre   r,  )rI   r   r	  r   r   r   )rn   rI   r	  r   r   r   rV   rV   rW   r0    s    z%PackagedDatasetModuleFactory.__init__rY   c           	         s   t | jp
d   }| jd k	r.t| jnt|}tj	|| j
|d}| jtk}| jd kr|r|tkrzt|}W n tk
r   d }Y nX |d k	rtj	|| j
|d  rt fdd| D }t| j \}}||| jd}t|||S )Nr  r   r   c                    s   i | ]\}}||  qS rV   rV   r   rH  rV   rW   r     s    z;PackagedDatasetModuleFactory.get_module.<locals>.<dictcomp>)r   r   rk   )r   r	  rR  rS  rG  r   r   r   r   r  r   rI   r1   r   r   r   r   r^   r3   r   )	rn   r   rW  r   r  rX  rd   r   r  rV   rH  rW   r$    s>    

  
z'PackagedDatasetModuleFactory.get_module)NNNN)rr   rs   rt   ru   r   r
   r   r   r   r    r!   r0  r   r$  rV   rV   rV   rW   r[    s       r[  c                
   @   sf   e Zd ZdZd	eeeeef  ee eeeee	f  ee
 eeeef  dddZedddZdS )
$HubDatasetModuleFactoryWithoutScriptz
    Get the module of a dataset loaded from data files of a dataset repository.
    The dataset builder module to use is inferred from the data files extensions.
    N)rI   r)  r	  r   r   r   c                 C   s:   || _ || _|| _|| _|p t | _|| _t|dd d S r\  )rI   r)  r   r	  r    r   r   r   )rn   rI   r)  r	  r   r   r   rV   rV   rW   r0    s    	z-HubDatasetModuleFactoryWithoutScript.__init__rY   c              
      s\  t tjj| j| j| jjdd}|j}d| j d| d| j	p>d 
d}| j }|jd krdd|_z,tt| jd|d	|d
}tt|j}W n tk
r   t }Y nX t|}t|}| jd k	rt| j}	nD|rdtt| krttt| d }	nt|| jd
}	tj|	|t | jd}
t!|
| j| jd\}}|
"t#| }
|t$k}| jd kr|r|	t%krzt&|}W n tk
r   d }Y nX |d k	rt'j|| j|d  rt fdd|
( D }
t)| \}}|rt*|||||| jd\}}nd\}}|t| jd| jd	| jt+t| jjd}| jd k	sL|s^|
|d< |,| | j }|jd krzd|_ztt| jtj-| jd	|d
}t.|ddJ}tdd t/|( D }t0|dkrtt|}|1||d< W 5 Q R X |,| |}W n tk
r   Y nX |d kr@t0|dkr@tt|}t2||||t3|||ddS )N      Y@)r)  tokentimeoutzhf://datasets/@r   r  Downloading readmer?  r)  r   r   r
  )r   rP   r   r]  c                    s   i | ]\}}||  qS rV   rV   r   rH  rV   rW   r   1  s    zCHubDatasetModuleFactoryWithoutScript.get_module.<locals>.<dictcomp>)r   r  r   r   rI  )r   r   repo_idrk   Downloading metadatar   r   c                 S   s   i | ]\}}|t |qS rV   rJ  rL  rV   rV   rW   r   W  s    r   r  rM  rN  )4r   r   HF_ENDPOINTdataset_inforI   r)  r   r`  shar	  rstriprz   r   r9   rA   r   rO  r   rP  r   r   rE   rQ  r+   r   r   r   r   r   r   r   r  r  r   rT  r2   r1   r   r   r   r^   r3   r  r.   r  rA  rT   r   r   rU  r   r  )rn   Zhfh_dataset_infor)  r   r   rD  rV  rp   r  rW  r   r   r   r  rX  rd   r   rv   rj   r  rC  r   rY  rZ  rV   rH  rW   r$    s    
$







  
	


z/HubDatasetModuleFactoryWithoutScript.get_module)NNNNN)rr   rs   rt   ru   r   r
   r   rG   r   r   r    r!   r0  r   r$  rV   rV   rV   rW   r^    s        r^  c                   @   s   e Zd ZdZdeeeeef  ee eee	ef  ee dddZ
edddZedd	d
ZedddZedddZdS )!HubDatasetModuleFactoryWithScriptz
    Get the module of a dataset from a dataset repository.
    The dataset script comes from the script inside the dataset repository.
    Nr(  c                 C   s4   || _ || _|pt | _|| _|| _t|dd d S r\  )rI   r)  r    r   r   rU   r   r/  rV   rV   rW   r0  z  s    z*HubDatasetModuleFactoryWithScript.__init__rY   c                 C   sH   t | j| jdd d | jd}| j }|jd kr<d|_t||dS )Nr   r   r   re  rP   r)  r2  r   )rA   rI   r   r)  r   rz   r   r9   )rn   r   r   rV   rV   rW   r3    s
    "

z9HubDatasetModuleFactoryWithScript.download_loading_scriptc              	   C   s\   t | jtj| jd}| j }|jd kr.d|_zt||dW S  t	t
fk
rV   Y d S X d S )Nrl  rf  r   )rA   rI   r   rA  r)  r   rz   r   r9   r   ConnectionError)rn   r  r   rV   rV   rW   download_dataset_infos_file  s    

z=HubDatasetModuleFactoryWithScript.download_dataset_infos_filec              	   C   sZ   t | jd| jd}| j }|jd kr,d|_zt||dW S  ttfk
rT   Y d S X d S )Nr?  rl  rc  r   )	rA   rI   r)  r   rz   r   r9   r   rm  )rn   Z
readme_urlr   rV   rV   rW   download_dataset_readme_file  s    

z>HubDatasetModuleFactoryWithScript.download_dataset_readme_filec              	   C   s   |   }|  }|  }t|}t| jt| jd| jd|| jd}g }|rZ|	t
j|f |rl|	d|f | jrx| jnt }t||||d| j| jd\}}	t  |	t| jd| jd| jd}
t||	|
S )	Nr  rl  r5  r?  r@  r7  rd  )r   r   re  )r3  rn  ro  rF   r   rI   rA   r)  r   r   r   rA  rU   rX   r   r   r[   r8  r   )rn   r   rC  rD  r   r   r   rU   rd   r   r  rV   rV   rW   r$    s>    

z,HubDatasetModuleFactoryWithScript.get_module)NNNN)rr   rs   rt   ru   r   r
   r   rG   r    r!   r0  r3  rn  ro  r   r$  rV   rV   rV   rW   rk  t  s        rk  c                   @   s4   e Zd ZdZd	eee dddZedddZdS )
CachedDatasetModuleFactoryz
    Get the module of a dataset that has been loaded once already and cached.
    The script that is loaded from the cache is the most recent one with a matching name.
    NrI   rU   c                 C   s$   || _ || _| j ddks td S )Nr   r   rI   rU   r   r.  rn   rI   rU   rV   rV   rW   r0    s    z#CachedDatasetModuleFactory.__init__rY   c              	      s   j rj nt }tj|djdd tj rLdd t D nd }|sjt	dj d|  fdd	}t
||d
d }dtj | dt|| dj d}tjs|d7 }t| dtj|djdd|jdd g}t  |jd}t|||S )Nr@  r   r   c                 S   s   g | ]}t |d kr|qS @   r   r   hrV   rV   rW   r     s      z9CachedDatasetModuleFactory.get_module.<locals>.<listcomp>zDataset  is not cached in c                    s&   t  |  jdd d   jS )Nr   r   r   )r   rI   r   statst_mtimeZmodule_hashr   rn   rV   rW   _get_modification_time  s    zECachedDatasetModuleFactory.get_module.<locals>._get_modification_timer   r   3Using the latest cached version of the module from  (last modified on () since it couldn't be found locally at r   &, or remotely on the Hugging Face Hub.)r   re  )rU   rX   rO   rP   rQ   rI   r   r   listdirr   r   timectimer   r   r   r  r   r   r[   r8  r   )rn   rU   hashesr~  r   Zwarning_msgrd   r  rV   r}  rW   r$    s8    
,

	z%CachedDatasetModuleFactory.get_module)N)	rr   rs   rt   ru   r   r
   r0  r   r$  rV   rV   rV   rW   rp    s    	rp  c                   @   s<   e Zd ZdZedd
eee dddZeddd	Z	dS )CachedMetricModuleFactoryu6  
    Get the module of a metric that has been loaded once already and cached.
    The script that is loaded from the cache is the most recent one with a matching name.

    <Deprecated version="2.5.0">

    Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate

    </Deprecated>
    r'  Nrq  c                 C   s$   || _ || _| j ddks td S )Nr   r   rr  rs  rV   rV   rW   r0    s    z"CachedMetricModuleFactory.__init__rY   c              	      s   j rj nt }tj|dj tj rDdd t D nd }|sbtdj d|  fdd}t	||dd	 }t
d
tj | dt|| dj d dtj|dj|jg}t  t||S )Nr6  c                 S   s   g | ]}t |d kr|qS rt  rv  rw  rV   rV   rW   r   %  s      z8CachedMetricModuleFactory.get_module.<locals>.<listcomp>zMetric ry  c                    s   t  |  jd   jS )Nr   )r   rI   rz  r{  r|  r}  rV   rW   r~  -  s    zDCachedMetricModuleFactory.get_module.<locals>._get_modification_timer  r   r  r  r  r  r   )rU   rX   rO   rP   rQ   rI   r   r  r   r   r   r  r  r  r   r[   r8  r  )rn   rU   r  r~  r   rd   rV   r}  rW   r$  !  s     
, z$CachedMetricModuleFactory.get_module)N)
rr   rs   rt   ru   r6   r   r
   r0  r  r$  rV   rV   rV   rW   r    s    	r  )rP   r)  r   r   rU   r	  r   rZ   c                 K   sn  |dkrt f |}t|ptj}d|_d|_|tjk|_ttdd | 	t
jddd }|dsp|d }t
j| |}	| tkrt| ||||d S | |rt
j| rt| ||d	 S td
t|  nt
j|	 rt|	||d	 S t
j| rt| |||d S t| rV| ddkrVzRt  ttj}
z|
j| ||j dd}W n t!k
r> } zt"|t#t$j%j&t$j%j'frt'd|  dt(|j) dndt*|krd|  d}t|r|d| d n|nFdt*|kr*d|  d}|r|d| d n|}t|d n|W 5 d}~X Y nX |dd |j+D krnt,| ||||d W S t-| |||||d W S W n t!k
rR } zzt.| |d W  W Y S  t!k
r@   t"|t#rt'd|  d| dt"|t/r|dt"|tr6td
t|	 d |  d!t(|j) d"| d|dY nX W 5 d}~X Y nX ntd
t|	 d#dS )$a"  
    Download/extract/cache a dataset module.

    Dataset codes are cached inside the dynamic modules cache to allow easy import (avoid ugly sys.path tweaks).

    Args:

        path (str): Path or name of the dataset.
            Depending on ``path``, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.

            For local datasets:

            - if ``path`` is a local directory (containing data files only)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. ``'./path/to/directory/with/my/csv/data'``.
            - if ``path`` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory):
              -> load the dataset builder from the dataset script
              e.g. ``'./dataset/squad'`` or ``'./dataset/squad/squad.py'``.

            For datasets on the Hugging Face Hub (list all available datasets with ``huggingface_hub.list_datasets()``)

            - if ``path`` is a dataset repository on the HF hub (containing data files only)
              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
              e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files.
            - if ``path`` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
              -> load the dataset builder from the dataset script in the dataset repository
              e.g. ``glue``, ``squad``, ``'username/dataset_name'``, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.

        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        dynamic_modules_path (Optional str, defaults to HF_MODULES_CACHE / "datasets_modules", i.e. ~/.cache/huggingface/modules/datasets_modules):
            Optional path to the directory in which the dynamic modules are saved. It must have been initialized with :obj:`init_dynamic_modules`.
            By default, the datasets and metrics are stored inside the `datasets_modules` module.
        data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified,
            in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`.
        data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments): optional attributes for DownloadConfig() which will override
            the attributes in download_config if supplied.

    Returns:
        DatasetModule
    NTc                 S   s   | S r   rV   xrV   rV   rW   <lambda>z      z(dataset_module_factory.<locals>.<lambda>r   r   r   )r	  r   r   r   r   rU   z"Couldn't find a dataset script at )r	  r   r   r   r_  )re  r)  r`  ra  zCouldn't reach 'z' on the Hub ()Z404z	Dataset 'z' doesn't exist on the Hubz at revision ''Z401zT. If the repo is private or gated, make sure to log in with `huggingface-cli login`.c                 S   s   g | ]
}|j qS rV   )Z	rfilename)r   ZsiblingrV   rV   rW   r     s     z*dataset_module_factory.<locals>.<listcomp>r)  r   r   rU   )r)  r	  r   r   r   rU   z1Couldn't reach the Hugging Face Hub for dataset 'z': z8 or any data file in the same directory. Couldn't find 'z"' on the Hugging Face Hub either: r   z( or any data file in the same directory.)0r    r!   REUSE_DATASET_IF_EXISTSextract_compressed_fileforce_extractr   Zforce_downloadr   filterr   rO   sepr   r   rP   rQ   r3   r[  r$  r   r>  r   r>   r   rE  r=   r   r8   r   r   rg  rh  r`  r   
isinstancer7   requests
exceptionsConnectTimeoutrm  typerr   r   Zsiblingsrk  r^  rp  r   )rP   r)  r   r   rU   r	  r   download_kwargsr   combined_pathZhf_apirh  r  msge1rV   rV   rW   dataset_module_factory<  s    7
&


  
  
   


 $ r  r'  )rP   r)  r   r   rU   rZ   c           	      K   s  t   t jddtd |dkr.tf |}t|p8tj}d|_d|_t	t
dd | tjddd	 }|d
s|d
 }tj| |}| |rtj| rt| ||d W  5 Q R  S tdt|  ntj|rt|||d W  5 Q R  S t| r| ddkrz$t| ||||d W W  5 Q R  S  tk
r } zpz&t| |d W  W Y VW  5 Q R  S  tk
r   t|ts|dtdt| d|  ddY nX W 5 d}~X Y nX ntdt| dW 5 Q R X dS )u  
    Download/extract/cache a metric module.

    <Deprecated version="2.5.0">

    Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate

    </Deprecated>

    Metrics codes are cached inside the dynamic modules cache to allow easy import (avoid ugly sys.path tweaks).

    Args:

        path (str): Path or name of the metric script.

            - if ``path`` is a local metric script or a directory containing a local metric script (if the script has the same name as the directory):
              -> load the module from the metric script
              e.g. ``'./metrics/accuracy'`` or ``'./metrics/accuracy/accuracy.py'``.
            - if ``path`` is a metric on the Hugging Face Hub (ex: `glue`, `squad`)
              -> load the module from the metric script in the GitHub repository at huggingface/datasets
              e.g. ``'accuracy'`` or ``'rouge'``.

        revision (Optional ``Union[str, datasets.Version]``):
            If specified, the module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        dynamic_modules_path (Optional str, defaults to HF_MODULES_CACHE / "datasets_modules", i.e. ~/.cache/huggingface/modules/datasets_modules):
            Optional path to the directory in which the dynamic modules are saved. It must have been initialized with :obj:`init_dynamic_modules`.
            By default, the datasets and metrics are stored inside the `datasets_modules` module.
        **download_kwargs (additional keyword arguments): optional attributes for DownloadConfig() which will override
            the attributes in download_config if supplied.

    Returns:
        MetricModule
    ignore'.*https://huggingface.co/docs/evaluate$messagecategoryNTc                 S   s   | S r   rV   r  rV   rV   rW   r  )  r  z'metric_module_factory.<locals>.<lambda>r   r   r   r  z!Couldn't find a metric script at r   r  r  z
. Metric 'z/' doesn't exist on the Hugging Face Hub either.r   )warningscatch_warningsfilterwarningsFutureWarningr    r!   r  r  r  r   r  r   rO   r  r   r   rP   rQ   r   r9  r$  r   r>   r=   r   r&  r   r  r  )	rP   r)  r   r   rU   r  r   r  r  rV   rV   rW   metric_module_factory  s`    0
&

    &r  ue   Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluateF)rP   r   
process_idnum_process	cache_direxperiment_idkeep_in_memoryr   r   r)  rZ   c
              
   K   s   t  v t jddtd t|p$tj}t| |	||dj}t|dd}|f ||||||d|
}|j	|d |W  5 Q R  S Q R X d	S )
u(	  Load a `datasets.Metric`.

    <Deprecated version="2.5.0">

    Use `evaluate.load` instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate

    </Deprecated>

    Args:

        path (``str``):
            path to the metric processing script with the metric builder. Can be either:
                - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                    e.g. ``'./metrics/rouge'`` or ``'./metrics/rogue/rouge.py'``
                - a metric identifier on the HuggingFace datasets repo (list all available metrics with ``datasets.list_metrics()``)
                    e.g. ``'rouge'`` or ``'bleu'``
        config_name (:obj:`str`, optional): selecting a configuration for the metric (e.g. the GLUE metric has a configuration for each subset)
        process_id (:obj:`int`, optional): for distributed evaluation: id of the process
        num_process (:obj:`int`, optional): for distributed evaluation: total number of processes
        cache_dir (Optional str): path to store the temporary predictions and references (default to `~/.cache/huggingface/metrics/`)
        experiment_id (``str``): A specific experiment id. This is used if several distributed evaluations share the same file system.
            This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
        keep_in_memory (bool): Whether to store the temporary results in memory (defaults to False)
        download_config (Optional ``datasets.DownloadConfig``: specific download configuration parameters.
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        revision (Optional ``Union[str, datasets.Version]``): if specified, the module will be loaded from the datasets repository
            at this version. By default, it is set to the local version of the lib. Specifying a version that is different from
            your local version of the lib might cause compatibility issues.

    Returns:
        `datasets.Metric`

    Example:

    ```py
    >>> from datasets import load_metric
    >>> accuracy = load_metric('accuracy')
    >>> accuracy.compute(references=[1, 0], predictions=[1, 1])
    {'accuracy': 0.5}
    ```
    r  r  r  )r)  r   r   F)re   )r   r  r  r  r  r  r   N)
r  r  r  r  r!   r  r  rd   rh   download_and_prepare)rP   r   r  r  r  r  r  r   r   r)  Zmetric_init_kwargsZmetric_moduleZ
metric_clsr+  rV   rV   rW   load_metricR  s,    7
   r  r6   )rP   rI   r	  r   r  featuresr   r   r)  r`  storage_optionsrZ   c                    s  |
dkr t d|
 dt |
}	t|p*tj}|	dk	rN|rB| nt }|	|_|dk	rt|rb| nt }|j	| t
 |||||d}|j}|d|}|d|}|d|p|jj}|d	d}|d
}|jr|j|nd}|jjr||jjkrt||jj| } tkrd|dkrdd  d} fddtD }|r\|d|d  d7 }t|t||d}|f |||||||||	|d
||}|S )aB  Load a dataset builder from the Hugging Face Hub, or a local dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.)
    without downloading the dataset itself.

    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

    A dataset is a directory that contains:

    - some data files in generic formats (JSON, CSV, Parquet, text, etc.)
    - and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.

    Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.

    Args:

        path (`str`):
            Path or name of the dataset.
            Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.

            For local datasets:

            - if `path` is a local directory (containing data files only)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. `'./path/to/directory/with/my/csv/data'`.
            - if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
              -> load the dataset builder from the dataset script
              e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.

            For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])

            - if `path` is a dataset repository on the HF hub (containing data files only)
              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
            - if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
              -> load the dataset builder from the dataset script in the dataset repository
              e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.

        name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_dir (`str`, *optional*):
            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
        features ([`Features`], *optional*):
            Set the features type to use for this dataset.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        use_auth_token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.

            <Deprecated version="2.14.0">

            `use_auth_token` was deprecated in favor of `token` in version 2.14.0 and will be removed in 3.0.0.

            </Deprecated>
        storage_options (`dict`, *optional*, defaults to `None`):
            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.

            <Added version="2.11.0"/>
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`]
            and used in the [`DatasetBuilder`].

    Returns:
        [`DatasetBuilder`]

    Example:

    ```py
    >>> from datasets import load_dataset_builder
    >>> ds_builder = load_dataset_builder('rotten_tomatoes')
    >>> ds_builder.info.features
    {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
     'text': Value(dtype='string', id=None)}
    ```
    r6   'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.
You can remove this warning by passing 'token=
' instead.N)r)  r   r   r	  r   r	  r   r   rk   r   z@Please specify the data files or data directory to load for the z dataset builder.c                    s   g | ]}t |  kr|qS rV   )r0   )r   	extensionrP   rV   rW   r   3  s     z(load_dataset_builder.<locals>.<listcomp>z9
For example `data_files={"train": "path/to/data/train/*.r   z"}`)rk   )
r  rk   r   r	  r   r   infor  r`  r  )r  warnr  r!   r  rz   r    r`  r  r  r  r  rU  r   rj   r  r  rp   r  r3   r0   r   r   )rP   rI   r	  r   r  r  r   r   r)  r`  use_auth_tokenr  config_kwargsr   r  r   rk   r   r  	error_msgZexample_extensionsro   builder_instancerV   r  rW   load_dataset_builder  s    g
	 


 

r  )rP   rI   r	  r   r   r  r  r   r   verification_moder  
save_infosr)  r`  	streamingnum_procr  rZ   c                 K   s  |dkr t d| dt |}|
dkrN|
r2tjntj}	t d|	j dt |dkrdt dt nd}|dk	r|std| dt| t	j
 rtd	|r|dk	rtd
t|ptj}t|s|	ptjntj}	tf | ||||||||||d|}|r|j|dS | tk}|j|||	|||d |dk	r:|n
t|jj}|j||	|d}|dk	rt   t dt ||}W 5 Q R X |r|  |S )a>)  Load a dataset from the Hugging Face Hub, or a local dataset.

    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

    A dataset is a directory that contains:

    - some data files in generic formats (JSON, CSV, Parquet, text, etc.).
    - and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.

    Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.

    This function does the following under the hood:

        1. Download and import in the library the dataset script from `path` if it's not already cached inside the library.

            If the dataset has no dataset script, then a generic dataset script is imported instead (JSON, CSV, Parquet, text, etc.)

            Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
            contain the path or URL to the original data files and the code to load examples from the original data files.

            You can find the complete list of datasets in the Datasets [Hub](https://huggingface.co/datasets).

        2. Run the dataset script which will:

            * Download the dataset file from the original URL (see the script) if it's not already available locally or cached.
            * Process and cache the dataset in typed Arrow tables for caching.

                Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.
                They can be directly accessed from disk, loaded in RAM or even streamed over the web.

        3. Return a dataset built from the requested splits in `split` (default: all).

    It also allows to load a dataset from a local directory or a dataset repository on the Hugging Face Hub without dataset script.
    In this case, it automatically loads all the data files from the directory or the dataset repository.

    Args:

        path (`str`):
            Path or name of the dataset.
            Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.

            For local datasets:

            - if `path` is a local directory (containing data files only)
              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
              e.g. `'./path/to/directory/with/my/csv/data'`.
            - if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
              -> load the dataset builder from the dataset script
              e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.

            For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])

            - if `path` is a dataset repository on the HF hub (containing data files only)
              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
            - if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
              -> load the dataset builder from the dataset script in the dataset repository
              e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.

        name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_dir (`str`, *optional*):
            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) or the Hub datasets and `data_files` is `None`,
            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        split (`Split` or `str`):
            Which split of the data to load.
            If `None`, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).
            If given, will return a single Dataset.
            Splits can be combined and specified like in tensorflow-datasets.
        cache_dir (`str`, *optional*):
            Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`.
        features (`Features`, *optional*):
            Set the features type to use for this dataset.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).

            <Added version="2.9.1"/>
        ignore_verifications (`bool`, defaults to `False`):
            Ignore the verifications of the downloaded/processed dataset information (checksums/size/splits/...).

            <Deprecated version="2.9.1">

            `ignore_verifications` was deprecated in version 2.9.1 and will be removed in 3.0.0.
            Please use `verification_mode` instead.

            </Deprecated>
        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
            nonzero. See more details in the [improve performance](../cache#improve-performance) section.
        save_infos (`bool`, defaults to `False`):
            Save the dataset information (checksums/size/splits/...).
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        use_auth_token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.

            <Deprecated version="2.14.0">

            `use_auth_token` was deprecated in favor of `token` in version 2.14.0 and will be removed in 3.0.0.

            </Deprecated>
        task (`str`):
            The task to prepare the dataset for during training and evaluation. Casts the dataset's [`Features`] to standardized column names and types as detailed in `datasets.tasks`.

            <Deprecated version="2.13.0">

            `task` was deprecated in version 2.13.0 and will be removed in 3.0.0.

            </Deprecated>
        streaming (`bool`, defaults to `False`):
            If set to `True`, don't download the data files. Instead, it streams the data progressively while
            iterating on the dataset. An [`IterableDataset`] or [`IterableDatasetDict`] is returned instead in this case.

            Note that streaming works for datasets that use data formats that support being iterated over like txt, csv, jsonl for example.
            Json files may be downloaded completely. Also streaming from remote zip or gzip files is supported but other compressed formats
            like rar and xz are not yet supported. The tgz format doesn't allow streaming.
        num_proc (`int`, *optional*, defaults to `None`):
            Number of processes when downloading and generating the dataset locally.
            Multiprocessing is disabled by default.

            <Added version="2.7.0"/>
        storage_options (`dict`, *optional*, defaults to `None`):
            **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any.

            <Added version="2.11.0"/>
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the `BuilderConfig`
            and used in the [`DatasetBuilder`].

    Returns:
        [`Dataset`] or [`DatasetDict`]:
        - if `split` is not `None`: the dataset requested,
        - if `split` is `None`, a [`~datasets.DatasetDict`] with each split.

        or [`IterableDataset`] or [`IterableDatasetDict`]: if `streaming=True`

        - if `split` is not `None`, the dataset is requested
        - if `split` is `None`, a [`~datasets.streaming.IterableDatasetDict`] with each split.

    Example:

    Load a dataset from the Hugging Face Hub:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('rotten_tomatoes', split='train')

    # Map data files to splits
    >>> data_files = {'train': 'train.csv', 'test': 'test.csv'}
    >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)
    ```

    Load a local dataset:

    ```py
    # Load a CSV file
    >>> from datasets import load_dataset
    >>> ds = load_dataset('csv', data_files='path/to/local/my_dataset.csv')

    # Load a JSON file
    >>> from datasets import load_dataset
    >>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json')

    # Load from a local loading script
    >>> from datasets import load_dataset
    >>> ds = load_dataset('path/to/local/loading_script/loading_script.py', split='train')
    ```

    Load an [`~datasets.IterableDataset`]:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('rotten_tomatoes', split='train', streaming=True)
    ```

    Load an image dataset with the `ImageFolder` dataset builder:

    ```py
    >>> from datasets import load_dataset
    >>> ds = load_dataset('imagefolder', data_dir='/path/to/images', split='train')
    ```
    r6   r  r  z'ignore_verifications' was deprecated in favor of 'verification_mode' in version 2.9.1 and will be removed in 3.0.0.
You can remove this warning by passing 'verification_mode=zF'task' was deprecated in version 2.13.0 and will be removed in 3.0.0.
NzEmpty 'data_files': 'z3'. It should be either non-empty or None (default).zjYou are trying to load a dataset that was saved using `save_to_disk`. Please use `load_from_disk` instead.zLoading a streaming dataset in parallel with `num_proc` is not implemented. To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader using `num_workers` > 1 instead.)rP   rI   r	  r   r  r  r   r   r)  r`  r  r   )r   r   r  try_from_hf_gcsr  r  )r   r  Z	in_memoryr  )r  r  r  rB   Z	NO_CHECKSZ
ALL_CHECKSr  r   r   r   DATASET_STATE_JSON_FILENAMErS   r"  r!   r  ZBASIC_CHECKSr  Zas_streaming_datasetr3   r  rC   r  Zdataset_sizeZ
as_datasetr  simplefilterZprepare_for_taskZ_save_infos)rP   rI   r	  r   r   r  r  r   r   r  Zignore_verificationsr  r  r)  r`  r  taskr  r  r  r  r  r  ZdsrV   rV   rW   load_datasetN  s     [


r  )dataset_pathr  r  rZ   c                 C   s   |dkrt dt |j}tj| |d}|d }t|rHt| }tj	}nt
d}| }tjj	}||sxtd|  d|||tjr|||tjrtj| ||dS |||tjrtj| ||dS td|  d	d
S )a  
    Loads a dataset that was previously saved using [`~Dataset.save_to_disk`] from a dataset directory, or
    from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.

    Args:
        dataset_path (`str`):
            Path (e.g. `"dataset/train"`) or remote URI (e.g.
            `"s3://my-bucket/dataset/train"`) of the [`Dataset`] or [`DatasetDict`] directory where the dataset will be
            loaded from.
        fs (`~filesystems.S3FileSystem` or `fsspec.spec.AbstractFileSystem`, *optional*):
            Instance of the remote filesystem used to download the files from.

            <Deprecated version="2.9.0">

            `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
            Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

            </Deprecated>

        keep_in_memory (`bool`, defaults to `None`):
            Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
            nonzero. See more details in the [improve performance](../cache#improve-performance) section.

        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the file-system backend, if any.

            <Added version="2.9.0"/>

    Returns:
        [`Dataset`] or [`DatasetDict`]:
        - If `dataset_path` is a path of a dataset directory: the dataset requested.
        - If `dataset_path` is a path of a dataset dict directory, a [`DatasetDict`] with each split.

    Example:

    ```py
    >>> from datasets import load_from_disk
    >>> ds = load_from_disk('path/to/dataset/directory')
    ```
    r6   z'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.
You can remove this warning by passing 'storage_options=fs.storage_options' instead.)r  r   filez
Directory z
 not found)r  r  z@ is neither a `Dataset` directory nor a `DatasetDict` directory.N)r  r  r  r  fsspecZget_fs_token_pathsr(   r'   	posixpathrQ   
filesystemrO   rP   rS   r   r   r   ZDATASET_INFO_FILENAMEr  r   load_from_diskZDATASETDICT_JSON_FILENAMEr   )r  fsr  r  Zfs_token_pathsZdest_dataset_pathZ	path_joinrV   rV   rW   r    s2    ,



r  )T)N)N)N)NN)NNN)NNNNNN)NNNN)	Nr   r   NNFNNN)NNNNNNNNNr6   N)NNNNNNNNNr6   NFNNr6   r6   FNN)r6   NN)ru   r   r[   r_   r   rO   r  r   r  r  collectionsr   dataclassesr   r   pathlibr   typingr   r   r   r	   r
   r   r   r   r   r  r  Zhuggingface_hubr   r   r   r  r   Zarrow_datasetr   Zbuilderr   r   r   r   r   r   r   r   r   r   Zdataset_dictr   r   Zdownload.download_configr    Zdownload.download_managerr!   Z#download.streaming_download_managerr"   r#   r$   r%   r  r&   Zfilesystemsr'   r(   fingerprintr)   r  r*   r+   Ziterable_datasetr,   r+  r-   Znamingr.   r/   Zpackaged_modulesr0   r1   r2   r3   r4   Zsplitsr5   Zutils.deprecation_utilsr6   Zutils.file_utilsr7   r8   r9   r:   r;   r<   r=   r>   r?   Zutils.filelockr@   Z	utils.hubrA   Zutils.info_utilsrB   rC   Zutils.loggingrD   Zutils.metadatarE   Zutils.py_utilsrF   Zutils.versionrG   rr   r   r   r   r  ZMODULE_NAME_FOR_DYNAMIC_MODULESr   rX   rh   ri   rl   r   r   r   r   r   r   r   r   r   r  r  r   r  r  r   r  r   r%  r&  r9  r>  rE  r[  r^  rk  rp  r  r  r  r   r  r  r  r  rV   rV   rV   rW   <module>   s  ,$	,  $)  
   BW
  &  '        
KK11o7 ^93       4    b         Q           ( /                   (  8       
