U
    -e}V                     @   s4  d Z ddlZddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z, e$e-Z.dddddddddddddgZ/e01dde01dde01d de01d!d"e01d#d$e01d%d&e01d'd(e01d)d*iZ2d+d,iZ3e4d-d. ee2e3D Z5G d/d0 d0ej6Z7G d1d2 d2eZ8e9e9d3d4d5Z:ee9 d6d7d8Z;e9ee9 d3d9d:Z<G d;d< d<eZ=G d=d> d>e=Z>G d?d@ d@e=Z?G dAdB dBZ@dS )CzDownload manager interface.    N)datetime)partial)chain)CallableDict	GeneratorIterableListOptionalTupleUnion   )config)DeprecatedEnum
deprecated)cached_pathget_from_cachehash_url_to_filenameis_relative_pathurl_or_path_join)get_size_checksum_dict)
get_loggeris_progress_bar_enabledtqdm)NestedDataStructure
map_nestedsize_str   )DownloadConfigtxtcsvjsonZjsonlZtsvZconllZconlluorigZparquetZpklpicklerelxmlZ504B0304zipZ504B0506Z504B0708Z425A68bz2Z1F8BgzipZFD377A585A00xzZ04224D18Zlz4Z28B52FFDZzstds   Rar!Zrarc                 c   s   | ]}t |V  qd S N)len).0magic_number r.   c/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/download/download_manager.py	<genexpr>F   s   r0   c                   @   s   e Zd ZdZdZdZdZdS )DownloadModea)  `Enum` for how to treat pre-existing downloads and data.

    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    reuse_dataset_if_existsreuse_cache_if_existsforce_redownloadN)__name__
__module____qualname____doc__REUSE_DATASET_IF_EXISTSREUSE_CACHE_IF_EXISTSFORCE_REDOWNLOADr.   r.   r.   r/   r1   L   s   r1   c                   @   s$   e Zd ZdZdZdZedd ZdS )GenerateModer2   r3   r4   c                 C   s   dS )NzUse 'DownloadMode' instead.r.   selfr.   r.   r/   help_messagef   s    zGenerateMode.help_messageN)r5   r6   r7   r9   r:   r;   propertyr?   r.   r.   r.   r/   r<   a   s
   r<   )pathreturnc                 C   s*   |  dd }dD ]}| |d }q|S )N.z?-_r   )split)rA   	extensionZsymbr.   r.   r/   _get_path_extensionk   s    rG   rB   c              	   C   s   z|  d W n ttjfk
r*   Y dS X | t}|  d ttD ]X}t|dt|  }|dk	rr|  S t	|dt|  }|dk	rHt
d| dqHdS )zQread the magic number from a file-like object and return the compression protocolr   NzCompression protocol 'z' not implemented.)seekAttributeErrorioUnsupportedOperationreadMAGIC_NUMBER_MAX_LENGTHrange$MAGIC_NUMBER_TO_COMPRESSION_PROTOCOLget0MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOLNotImplementedError)fr-   icompressionr.   r.   r/   *_get_extraction_protocol_with_magic_numberu   s    

rW   c              
   C   sX   t | } t| }|tks*|dks*| dr.d S t| d}t|W  5 Q R  S Q R X d S )N)tgztar)z.tar.gzz.tar.bz2z.tar.xzrb)strrG   BASE_KNOWN_EXTENSIONSendswithopenrW   )rA   rF   rT   r.   r.   r/   _get_extraction_protocol   s    r_   c                   @   s&   e Zd ZdZedddZdd ZdS )_IterableFromGeneratorzkUtility class to create an iterable from a generator function, in order to reset the generator when needed.)	generatorc                 O   s   || _ || _|| _d S r*   ra   argskwargs)r>   ra   rc   rd   r.   r.   r/   __init__   s    z_IterableFromGenerator.__init__c                 c   s   | j | j| jE d H  d S r*   rb   r=   r.   r.   r/   __iter__   s    z_IterableFromGenerator.__iter__N)r5   r6   r7   r8   r   re   rf   r.   r.   r.   r/   r`      s   r`   c                   @   s   e Zd ZdZedd Zedd Zeee	ddf ddd	Z
eeee	ddf d
ddZed dddZed dddZdS )ArchiveIterablezIAn iterable of (path, fileobj) from a TAR archive, used by `iter_archive`c                 c   sf   t j| dd}|D ]L}|j}| s&q|d kr0qtj|drDq||}||fV  g |_	q~d S )Nzr|*)fileobjmoderC   __)
tarfiler^   nameisregosrA   basename
startswithextractfilemembers)rT   streamtarinfo	file_pathfile_objr.   r.   r/   	_iter_tar   s    

zArchiveIterable._iter_tarc                 c   s^   t | }| D ]F}|j}| r&q|d kr0qtj|drDq|	|}||fV  qd S )Nrj   )
zipfileZipFileinfolistfilenameis_dirro   rA   rp   rq   r^   )rT   Zzipfmemberrv   rw   r.   r.   r/   	_iter_zip   s    

zArchiveIterable._iter_zipNrH   c                 c   s6   t |}|dkr"| |E d H  n| |E d H  d S )Nr&   )rW   r   rx   )clsrT   rV   r.   r.   r/   _iter_from_fileobj   s    z"ArchiveIterable._iter_from_fileobj)urlpathrB   c              	   c   sL   t |}t|d0}|dkr.| |E d H  n| |E d H  W 5 Q R X d S )NrZ   r&   )r_   r^   r   rx   )r   r   rV   rT   r.   r.   r/   _iter_from_path   s
    zArchiveIterable._iter_from_pathc                 C   s   | | j |S r*   )r   )r   rh   r.   r.   r/   from_buf   s    zArchiveIterable.from_bufc                 C   s   | | j |S r*   )r   )r   Zurlpath_or_bufr.   r.   r/   	from_path   s    zArchiveIterable.from_path)r5   r6   r7   r8   staticmethodrx   r   classmethodr   r   r   r[   r   r   r   r.   r.   r.   r/   rg      s   

rg   c                   @   sL   e Zd ZdZeeeee f eeddf dddZ	ed dddZ
dS )	FilesIterablez8An iterable of paths from a list of directories or filesN)urlpathsrB   c                 c   s   t |ts|g}|D ]}tj|r@tj|dr8q|V  qt|D ]b\}}}tdd |D |d d < tj|drqJt|D ] }|drqtj	||V  qqJqd S )Nrj   c                 S   s   g | ]}| d s|qS )rj   )rq   )r,   dirnamer.   r.   r/   
<listcomp>   s     
 z2FilesIterable._iter_from_paths.<locals>.<listcomp>)

isinstancelistro   rA   isfilerp   rq   walksortedjoin)r   r   r   dirpathdirnames	filenamesr|   r.   r.   r/   _iter_from_paths   s    

zFilesIterable._iter_from_pathsrH   c                 C   s   | | j |S r*   )r   )r   r   r.   r.   r/   
from_paths   s    zFilesIterable.from_paths)r5   r6   r7   r8   r   r   r[   r	   r   r   r   r.   r.   r.   r/   r      s
   (r   c                   @   s   e Zd ZdZd)ee ee ee ee dddZedd Z	ed	d
 Z
edd ZeedddZeddd Zdd ZeeedddZeeejf dddZeeee f dddZd*dd Zd!d" Zd#d$ Zd%d& Zd'd( ZdS )+DownloadManagerFNT)dataset_namedata_dirdownload_config	base_pathc                 C   sF   || _ || _|ptjd| _i | _|| _|p2t | _	i | _
i | _dS )a4  Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of dataset this instance will be used for. If
                provided, downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other
                download options
            base_path (`str`):
                base path that is used when relative paths are used to
                download files. This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        rC   N)Z_dataset_name	_data_dirro   rA   abspath
_base_path_recorded_sizes_checksumsrecord_checksumsr   r   downloaded_pathsextracted_paths)r>   r   r   r   r   r   r.   r.   r/   re     s    zDownloadManager.__init__c                 C   s   | j S r*   )r   r=   r.   r.   r/   
manual_dir$  s    zDownloadManager.manual_dirc                 C   s   t dd | j D S )z+Returns the total size of downloaded files.c                 s   s   | ]}|d  V  qdS )	num_bytesNr.   )r,   Zchecksums_dictr.   r.   r/   r0   +  s     z2DownloadManager.downloaded_size.<locals>.<genexpr>)sumr   valuesr=   r.   r.   r/   downloaded_size(  s    zDownloadManager.downloaded_sizec                    sX   ddl m |j d  dkr,td fddtfdd	| t  d
}|S )a  Ship the files using Beam FileSystems to the pipeline temp dir.

        Args:
            downloaded_path_or_paths (`str` or `list[str]` or `dict[str, str]`):
                Nested structure containing the
                downloaded path(s).
            pipeline ([`utils.beam_utils.BeamPipeline`]):
                Apache Beam Pipeline.

        Returns:
            `str` or `list[str]` or `dict[str, str]`
        r   )upload_local_to_remoteZtemp_locationNzFYou need to specify 'temp_location' in PipelineOptions to upload filesc              	      sP   t  tjtj| }td|  dt	tj
|  d| d | | |S )Nz
Uploading z (z) to rC   )	posixpathr   r   ZDOWNLOADED_DATASETS_DIRro   rA   rp   loggerinfor   getsize)local_file_pathZremote_file_path)
remote_dirr   r.   r/   uploadA  s      
"
z8DownloadManager.ship_files_with_pipeline.<locals>.uploadc                    s    | S r*   r.   )r   )r   r.   r/   <lambda>L      z:DownloadManager.ship_files_with_pipeline.<locals>.<lambda>disable_tqdm)Zutils.beam_utilsr   _optionsZget_all_optionsrQ   
ValueErrorr   r   )downloaded_path_or_pathsZpipelineZuploaded_path_or_pathsr.   )r   r   r   r/   ship_files_with_pipeline-  s    

z(DownloadManager.ship_files_with_pipeline)url_or_urlsr   c                 C   sP   d}t tt| | |dt  dD ] \}}t|| jd| jt|< q*dS )z)Record size/checksum of downloaded files.   zComputing checksums)delaydescdisable)Zrecord_checksumN)	r   r   r&   flattenr   r   r   r   r[   )r>   r   r   r   urlrA   r.   r.   r/   _record_sizes_checksumsR  s     z'DownloadManager._record_sizes_checksumszCUse `.download`/`.download_and_extract` with `fsspec` URLs instead.c           	   	      s   | j jptj | j j} fdd}t||t  d}t|}t|}t|	 |	 D ]d\}}zt
| dd|d d}W n tk
r   d}Y nX |r| j jrV||| t
| dd|d qV| || |jS )a  
        Download given urls(s) by calling `custom_download`.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.
            custom_download (`Callable[src_url, dst_path]`):
                The source URL and destination path. For example
                `tf.io.gfile.copy`, that lets you download from  Google storage.

        Returns:
            downloaded_path(s): `str`, The downloaded paths matching the given input
                `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download_custom('s3://my-bucket/data.zip', custom_download_for_my_private_bucket)
        ```
        c                    s   t j t| S r*   )ro   rA   r   r   )r   	cache_dirr.   r/   url_to_downloaded_pathy  s    z?DownloadManager.download_custom.<locals>.url_to_downloaded_pathr   TF)r   Zlocal_files_onlyZuse_etagmax_retries)r   r   r   ZDOWNLOADED_DATASETS_PATHr   r   r   r   r&   r   r   FileNotFoundErrorZforce_downloadr   data)	r>   r   Zcustom_downloadr   r   r   r   rA   cachedr.   r   r/   download_custom`  sB          

    zDownloadManager.download_customc                 C   s   | j  }d|_|jdkr d|_t| j|d}t }t||d|j	t
  dd}t | }td| d	  d
 t|}t|}| jtt| |  t }| || t | }td| d	  d
 |jS )ay  Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        FNDownloading datar   TzDownloading data files)Z	map_tuplenum_procr   r   zDownloading took <   z minzChecksum Computation took )r   copyextract_compressed_filedownload_descr   	_downloadr   nowr   r   r   r   r   total_secondsr   r   updatedictr&   r   r   r   )r>   r   r   Zdownload_func
start_timer   durationr.   r.   r/   download  s0    

zDownloadManager.download)url_or_filenamer   rB   c                 C   s(   t |}t|rt| j|}t||dS )Nr   )r[   r   r   r   r   )r>   r   r   r.   r.   r/   r     s    zDownloadManager._download)path_or_bufc                 C   s"   t |drt|S t|S dS )aK  Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        rM   N)hasattrrg   r   r   )r>   r   r.   r.   r/   iter_archive  s    

zDownloadManager.iter_archive)pathsc                 C   s
   t |S )a  Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        )r   r   )r>   r   r.   r.   r/   
iter_files  s    zDownloadManager.iter_filesr   c                 C   s   |dkrt dt | j }d|_|jdkr4d|_ttt	|d||j
t  dd}t|}t|}| jtt| |  |jS )	ak  Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.
            num_proc (`int`):
                Use multi-processing if `num_proc` > 1 and the length of
                `path_or_paths` is larger than `num_proc`.

                <Deprecated version="2.6.2">

                Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.

                </Deprecated>

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        r   z'num_proc' was deprecated in version 2.6.2 and will be removed in 3.0.0. Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.TNr   r   zExtracting data files)r   r   r   )warningswarnFutureWarningr   r   r   r   r   r   r   r   r   r   r   r   r   r&   r   r   )r>   Zpath_or_pathsr   r   r   r.   r.   r/   extract  s(    


zDownloadManager.extractc                 C   s   |  | |S )a  Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        )r   r   )r>   r   r.   r.   r/   download_and_extract%  s    z$DownloadManager.download_and_extractc                 C   s
   | j  S r*   )r   r   r=   r.   r.   r/   get_recorded_sizes_checksums7  s    z,DownloadManager.get_recorded_sizes_checksumsc                 C   s^   t | j t | j  }t| j D ].\}}||kr*tj|r*t	| | j|= q*d S r*   )
setr   r   r   r   itemsro   rA   r   remove)r>   Zpaths_to_deletekeyrA   r.   r.   r/   delete_extracted_files:  s
    
z&DownloadManager.delete_extracted_filesc                 C   s   | j jr|   d S r*   )r   Zdelete_extractedr   r=   r.   r.   r/   manage_extracted_filesA  s    z&DownloadManager.manage_extracted_files)NNNNT)r   )r5   r6   r7   Zis_streamingr
   r[   r   re   r@   r   r   r   r   r   r   r   r   r   r   r   rK   BufferedReaderr   r	   r   r   r   r   r   r   r.   r.   r.   r/   r      s<        #


$
00
1r   )Ar8   enumrK   ro   r   rl   r   ry   r   	functoolsr   	itertoolsr   typingr   r   r   r   r	   r
   r   r    r   Zutils.deprecation_utilsr   r   Zutils.file_utilsr   r   r   r   r   Zutils.info_utilsr   Zutils.loggingr   r   r   Zutils.py_utilsr   r   r   r   r   r5   r   r\   bytesfromhexrP   rR   maxrN   Enumr1   r<   r[   rG   rW   r_   r`   rg   r   r   r.   r.   r.   r/   <module>   s   (         

= 