U
    -e~                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dl m!Z! d	d
l"m#Z# d	dl$m%Z% d	dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d	dl-m.Z. d	dl/m0Z0 ddl1m2Z2 e.e3Z4dddddddddddddgZ5dd e%D d d iZ6d!d" e%D Z7e8d#Z9e:;d$d e:;d%d e:;d&d e:;d'd(e:;d)d*e:;d+d,e:;d-d.e:;d/d0iZ<d1d2iZ=e>d3d4 ee<e=D Z?G d5d6 d6e@ZAd7d8 ZBd9d: ZCdeDee2 d;d<d=ZEd>d? ZFd@dA ZGdBdC ZHdee2 eIdDdEdFZJdee2 eKdDdGdHZLdee2 eIdDdIdJZMddKdLZNdMdN ZOeDeDdOdPdQZPeeD dRdSdTZQdeDee2 eeD dUdVdWZRdeDee2 eeDeeDeeDef f f dUdXdYZSdeDee2 eeDeeDeeDef f f dUdZd[ZTddd]eDee2 d^d_d`ZUdeDee2 eeD dadbdcZVddddeee2 d]dfdgZWdee2 d]dhdiZXG djdk dkeYe ZZeeDeeZf dldmdnZ[dd]ee2 d]dodpZ\dd]ee2 d]dqdrZ]dee2 d]dsdtZ^dee2 d]dudvZ_dee2 d]dwdxZ`dee2 d]dydzZadee2 d]d{d|ZbG d}d~ d~eZcG dd decZdG dd decZeG dd dZfdS )    N)TimeoutError)BytesIO)chain)PathPurePosixPath)	AnyCallableDict	GeneratorIterableListOptionalTupleUnion)ElementTree)ClientError   )config)COMPRESSION_FILESYSTEMS)"get_authentication_headers_for_urlget_datasets_user_agent	http_headis_local_pathis_relative_pathurl_or_path_join)
get_logger)
map_nested   )DownloadConfigtxtcsvjsonZjsonlZtsvZconllZconlluorigZparquetZpklpicklerelxmlc                 C   s   i | ]}|j d |jqS ).)	extensionlstripprotocol.0Zfs_class r,   m/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/download/streaming_download_manager.py
<dictcomp>6   s     
 r.   zipc                 C   s   h | ]
}|j qS r,   )r)   r*   r,   r,   r-   	<setcomp>:   s     r0   z(?<!:):/Z504B0304Z504B0506Z504B0708Z425A68bz2Z1F8BgzipZFD377A585A00xzZ04224D18Zlz4Z28B52FFDZzstds   Rar!Zrarc                 c   s   | ]}t |V  qd S N)len)r+   magic_numberr,   r,   r-   	<genexpr>K   s   r7   c                   @   s   e Zd ZdS )NonStreamableDatasetErrorN)__name__
__module____qualname__r,   r,   r,   r-   r8   Q   s   r8   c                 G   sP   t | d^} }t| r,tjj| f| S tj| f| } d| g| S dS )u#  
    This function extends os.path.join to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xjoin function allows you to apply the join on the first path of the chain.

    Example::

        >>> xjoin("zip://folder1::https://host.com/archive.zip", "file.txt")
        zip://folder1/file.txt::https://host.com/archive.zip
    ::N)strsplitr   ospathjoin	posixpath)apbr,   r,   r-   xjoinU   s
    rF   c                 C   s\   t | d^} }t| r0tjt|  } n
t| } | 	drL| d7 } d
| g| S )u#  
    This function extends os.path.dirname to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xdirname function allows you to apply the dirname on the first path of the chain.

    Example::

        >>> xdirname("zip://folder1/file.txt::https://host.com/archive.zip")
        zip://folder1::https://host.com/archive.zip
    r<   ://)r=   r>   r   r?   r@   dirnamer   as_posixrB   endswithrA   rC   rE   r,   r,   r-   xdirnamep   s    

rM   )urlpathdownload_configc                 C   sV   t | d^}}t|r&tj|S t| |d\} }tj| |d^}}||S dS )a  Extend `os.path.exists` function to support both local and remote files.

    Args:
        urlpath (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    r<   rO   storage_optionsN)	_as_strr>   r   r?   r@   exists!_prepare_path_and_storage_optionsfsspecget_fs_token_paths)rN   rO   main_hop	rest_hopsrR   fs_r,   r,   r-   xexists   s    r\   c                 C   s<   t | d^} }t| r.tjt|  S t| S dS )u  
    This function extends os.path.basename to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xbasename function allows you to apply the basename on the first path of the chain.

    Example::

        >>> xbasename("zip://folder1/file.txt::https://host.com/archive.zip")
        file.txt
    r<   N)	r=   r>   r   r?   r@   basenamer   rJ   rB   rL   r,   r,   r-   	xbasename   s    r^   c                 C   sf   t | d^} }t| r.tjt|  S t| \} }d| 	drR| d n| g| |fS dS )u,  
    This function extends os.path.split to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xsplit function allows you to apply the xsplit on the first path of the chain.

    Example::

        >>> xsplit("zip://folder1/file.txt::https://host.com/archive.zip")
        ('zip://folder1::https://host.com/archive.zip', 'file.txt')
    r<   rG   rH   N)
r=   r>   r   r?   r@   r   rJ   rB   rA   rK   )rC   rE   tailr,   r,   r-   xsplit   s
    r`   c                 C   sT   t | d^} }t| r.tjt|  S t| \} }d	| g| |fS dS )u8  
    This function extends os.path.splitext to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xsplitext function allows you to apply the splitext on the first path of the chain.

    Example::

        >>> xsplitext("zip://folder1/file.txt::https://host.com/archive.zip")
        ('zip://folder1/file::https://host.com/archive.zip', '.txt')
    r<   N)
r=   r>   r   r?   r@   splitextr   rJ   rB   rA   )rC   rE   extr,   r,   r-   	xsplitext   s
    rc   rO   returnc                 C   sV   t | d^}}t|r&tj| S t| |d\} }tj| |d^}}||S dS )zExtend `os.path.isfile` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    r<   rP   rQ   N)	r=   r>   r   r?   r@   isfilerU   rV   rW   )r@   rO   rX   rY   rR   rZ   r[   r,   r,   r-   xisfile   s    
rg   c           	   	   C   s   t | d^}}t|r&tj| S t| |d\} }tj| |d^}}|	|}|dkr~t
| |d}t| }W 5 Q R X |S dS )zExtend `os.path.getsize` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `int`: optional
    r<   rP   rQ   N)r=   r>   r   r?   r@   getsizerU   rV   rW   sizexopenr5   read)	r@   rO   rX   rY   rR   rZ   r[   ri   fr,   r,   r-   xgetsize  s    

rm   c                 C   sr   t | d^}}t|r&tj| S t| |d\} }tj| |d^}}|dd }|	dsddS ||S dS )	zExtend `os.path.isdir` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    r<   rP   rQ   ://r   /TN)
r=   r>   r   r?   r@   isdirrU   rV   rW   strip)r@   rO   rX   rY   rR   rZ   r[   
inner_pathr,   r,   r-   xisdir  s    

rs   c                 C   sj   t | d^}}t|r:|r.tjj||dS tj|S |rZtj|t |dd dS tj|S dS )zExtend `os.path.relpath` function to support remote files.

    Args:
        path (`str`): URL path.
        start (`str`): Start URL directory path.

    Returns:
        `str`
    r<   )startr   N)r=   r>   r   r?   r@   relpathrB   )r@   rt   rX   rY   r,   r,   r-   xrelpath4  s    
 rv   c                    s$   | j tj  fdd}|| _ d S )Nc                     s   d }t d d D ]p}z| |}W  qW q ttfk
r } z4|}tdtj d| d  d ttj W 5 d }~X Y qX qt	d||S )Nr   z4Got disconnected from remote data host. Retrying in zsec [ro   ]zServer Disconnected)
ranger   r   loggerwarningr   ZSTREAMING_READ_RETRY_INTERVALtimesleepConnectionError)argskwargsZdisconnect_errretryouterrmax_retriesrk   r,   r-   read_with_retriesI  s    

 
z?_add_retries_to_file_obj_read_method.<locals>.read_with_retries)rk   r   ZSTREAMING_READ_MAX_RETRIES)file_objr   r,   r   r-   $_add_retries_to_file_obj_read_methodE  s    r   )r@   re   c                 C   s*   |  dd }dD ]}| |d }q|S )Nr&   z?-_r   )r>   )r@   r'   Zsymbr,   r,   r-   _get_path_extension\  s    r   re   c              	   C   s   z|  d W n ttjfk
r*   Y dS X | t}|  d ttD ]X}t|dt|  }|dk	rr|  S t	|dt|  }|dk	rHt
d| dqHdS )zQread the magic number from a file-like object and return the compression protocolr   NzCompression protocol 'z' not implemented.)seekAttributeErrorioUnsupportedOperationrk   MAGIC_NUMBER_MAX_LENGTHrx   $MAGIC_NUMBER_TO_COMPRESSION_PROTOCOLget0MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOLNotImplementedError)rl   r6   icompressionr,   r,   r-   *_get_extraction_protocol_with_magic_numberf  s    

r   rN   rO   re   c              
   C   s   t | } | dd }t|}|tks8|dks8|dr<d S |tkrLt| S t| |d\} }z4tj| f|pli }t	|W  5 Q R  W S Q R X W n2 t
k
r   | tjrt
| d d n Y nX d S )Nr<   r   tgztarz.tar.gzz.tar.bz2z.tar.xzrP   S
If the repo is private or gated, make sure to log in with `huggingface-cli login`.)r=   r>   r   BASE_KNOWN_EXTENSIONSrK   !COMPRESSION_EXTENSION_TO_PROTOCOLrU   rV   openr   FileNotFoundError
startswithr   HF_ENDPOINT)rN   rO   r@   r'   rR   rl   r,   r,   r-   _get_extraction_protocolx  s0     r   c                 C   sJ   g }i }|  dD ](}t||d\}}|| || qd||fS )Nr<   rP   )r>   ,_prepare_single_hop_path_and_storage_optionsappendupdaterA   )rN   rO   Zprepared_urlpathZprepared_storage_optionsZhoprR   r,   r,   r-   rU     s    
rU   c           	      C   s  |dkrdn|j }d| kr(| dd nd}|dk	rJ||jkrJ|j| }n,|dk	rr||jkrrdd |j D }ni }|r||i}|dkrTt| |dd	t id
did||i ||< d| krt| }d}|j D ]<\}}|	dr| d| 7 } |j}d|i||i ||< qd| kr6d| kr6| d7 } | 	drzd|| d d< n&|dkrz|t
jd||i ||< | |fS )a\  
    Prepare the URL and the kwargs that must be passed to the HttpFileSystem or to requests.get/head

    In particular it resolves google drive URLs
    It also adds the authentication headers for the Hugging Face Hub, for both https:// and hf:// paths.

    Storage options are formatted in the form {protocol: storage_options_for_protocol}
    Nrn   r   filec                 S   s"   i | ]\}}|t  kr||qS r,   )rV   Zavailable_protocols)r+   Zoption_nameZoption_valuer,   r,   r-   r.     s    z@_prepare_single_hop_path_and_storage_options.<locals>.<dictcomp>)httphttps)tokenz
user-agent	trust_envT)headersZclient_kwargszdrive.google.comZdownload_warningz	&confirm=cookieszconfirm=z
&confirm=tz"https://raw.githubusercontent.com/identityr   zAccept-EncodingZhf)r   Zendpoint)r   r>   rR   itemsr   r   r   r   r   r   r   r   )	rN   rO   r   r)   rR   responser   kvr,   r,   r-   r     sN    

 




r   rrP   )r   rO   c             
   O   s   t | }|d^}}t|r0t||f||S t||d\} }||pHi }z"tj| f|d|i| }	W nl tk
r }
 zt|
dkrtd|
n W 5 d}
~
X Y n2 t	k
r   | 
tjrt	| d dn Y nX t|	 |	S )a  Extend `open` function to support remote files using `fsspec`.

    It also has a retry mechanism in case connection fails.
    The `args` and `kwargs` are passed to `fsspec.open`, except `token` which is used for queries to private repos on huggingface.co

    Args:
        file (`str`): Path name of the file to be opened.
        mode (`str`, *optional*, default "r"): Mode in which the file is opened.
        *args: Arguments to be passed to `fsspec.open`.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs: Keyword arguments to be passed to `fsspec.open`.

    Returns:
        file object
    r<   rP   modezCannot seek streaming HTTP filezStreaming is not possible for this dataset because data host server doesn't support HTTP range requests. You can still load this dataset in non-streaming mode by passing `streaming=False` (default)Nr   )rS   r>   r   r   rU   rV   
ValueErrorr=   r8   r   r   r   r   r   )r   r   rO   r~   r   Zfile_strrX   rY   rR   r   er,   r,   r-   rj     s4    "rj   )r@   rO   re   c           	      C   s   t | d^}}t|r$t| S t| |d\} }tj| |d^}}|dd }|drv|	|svt
d|  ||}dd	 |D S d
S )zExtend `os.listdir` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `list` of `str`
    r<   rP   rQ   rn   r   ro   zDirectory doesn't exist: c                 S   s   g | ]}t j|d  qS )name)r?   r@   r]   )r+   objr,   r,   r-   
<listcomp>  s     zxlistdir.<locals>.<listcomp>N)rS   r>   r   r?   listdirrU   rV   rW   rq   rp   r   )	r@   rO   rX   rY   rR   rZ   r[   rr   objectsr,   r,   r-   xlistdir  s    


r   F)	recursiverO   c          	         s   t | d^}t|r(tj||dS t| |d\} }tj| |d^}}|dd }||}t|jt	rt|jn|jd   fdd	|D S d
S )a  Extend `glob.glob` function to support remote files.

    Args:
        urlpath (`str`): URL path with shell-style wildcard patterns.
        recursive (`bool`, default `False`): Whether to match the "**" pattern recursively to zero or more
            directories or subdirectories.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `list` of `str`
    r<   )r   rP   rQ   rn   r   r   c                    s&   g | ]}d    d| g qS )r<   rn   )rA   )r+   globbed_pathr)   rY   r,   r-   r   6  s     zxglob.<locals>.<listcomp>N)
rS   r>   r   globrU   rV   rW   
isinstancer)   r=   )	rN   r   rO   rX   rR   rZ   r[   rr   globbed_pathsr,   r   r-   xglob  s    
r   c                 k   s   t | d^}}t|r0tj|f|E dH  nt| |d\} }tj| |d^}}|dd }|drx|	|sxg S t
|jtr|jn|jd }	|j|f|D ],\}
}}d|	 d|
 g| ||fV  qdS )	au  Extend `os.walk` function to support remote files.

    Args:
        urlpath (`str`): URL root path.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs: Additional keyword arguments forwarded to the underlying filesystem.


    Yields:
        `tuple`: 3-tuple (dirpath, dirnames, filenames).
    r<   NrP   rQ   rn   r   ro   r   )rS   r>   r   r?   walkrU   rV   rW   rq   rp   r   r)   r=   rA   )rN   rO   r   rX   rY   rR   rZ   r[   rr   r)   dirpathdirnames	filenamesr,   r,   r-   xwalk9  s    r   c                       s   e Zd ZdZ fddZdee dddZd ee ddd	Zd
d Z	e
d dddZe
edddZe
edddZe
edddZdd Zeedf d dddZed dddZ fddZ  ZS )!xPathzHExtension of `pathlib.Path` to support both local paths and remote URLs.c                    sV   t   }|d^}}t|r$|S |dd}td|}||drLdnd7 }|S )Nr<   \ro   rn   rG   rH    )super__str__r>   r   replace#SINGLE_SLASH_AFTER_PROTOCOL_PATTERNsubrK   )selfZpath_strrX   rY   Zpath_as_posix	__class__r,   r-   r   W  s    
zxPath.__str__NrP   c                 C   s   t t| |dS )zExtend `pathlib.Path.exists` method to support both local and remote files.

        Args:
            download_config : mainly use token or storage_options to support different platforms and auth types.

        Returns:
            `bool`
        rP   )r\   r=   )r   rO   r,   r,   r-   rT   a  s    	zxPath.existsc                 c   s   |   }|d^}}t|r4t||E dH  n|r~|d }t||d\}}|dd |i}d||f|dd }nd}tjt	|||d^}}	|t	||}
|
D ]*}t
| d|j d| g| V  qdS )a]  Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Args:
            pattern (`str`): Pattern that resulting paths must match.
            download_config : mainly use token or storage_options to support different platforms and auth types.

        Yields:
            [`xPath`]
        r<   Nr   rP   rn   r   rQ   )rJ   r>   r   r   r   rU   rA   rV   rW   rF   typer)   )r   patternrO   Z
posix_pathrX   rY   rN   rR   rZ   r[   r   r   r,   r,   r-   r   l  s    
z
xPath.globc                 K   s   | j d| f|S )zRglob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Args:
            pattern (`str`): Pattern that resulting paths must match.

        Yields:
            [`xPath`]
        z**/)r   )r   r   r   r,   r,   r-   rglob  s    	zxPath.rglobr   c                 C   s   t | t|  S )zName function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            [`xPath`]
        )r   rM   rJ   r   r,   r,   r-   parent  s    zxPath.parentc                 C   s   t |  dd jS )zName function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        r<   r   )r   rJ   r>   r   r   r,   r,   r-   r     s    z
xPath.namec                 C   s   t |  dd jS )zStem function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        r<   r   )r   rJ   r>   stemr   r,   r,   r-   r     s    z
xPath.stemc                 C   s   t |  dd jS )zSuffix function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        r<   r   )r   rJ   r>   suffixr   r,   r,   r-   r     s    zxPath.suffixc                 O   s   t t| f||S )a  Extend :func:`xopen` to support argument of type :obj:`~pathlib.Path`.

        Args:
            **args: Arguments passed to :func:`fsspec.open`.
            **kwargs: Keyword arguments passed to :func:`fsspec.open`.

        Returns:
            `io.FileIO`: File-like object.
        )rj   r=   )r   r~   r   r,   r,   r-   r     s    
z
xPath.open.)rD   re   c                 G   s   t | t|  f| S )zExtend :func:`xjoin` to support argument of type :obj:`~pathlib.Path`.

        Args:
            *p (`tuple` of `str`): Other path components.

        Returns:
            [`xPath`]
        )r   rF   rJ   r   rD   r,   r,   r-   joinpath  s    	zxPath.joinpathc                 C   s
   |  |S r4   )r   r   r,   r,   r-   __truediv__  s    zxPath.__truediv__c                    s`   t | d^}}t|r2t| t t |S t| dt| t|| g| S )Nr<   )	r=   r>   r   r   r   with_suffixrA   r   rJ   )r   r   rX   rY   r   r,   r-   r     s    zxPath.with_suffix)N)N)r9   r:   r;   __doc__r   r   r   rT   r   r   propertyr   r=   r   r   r   r   r   r   r   r   __classcell__r,   r,   r   r-   r   T  s"   
 r   r@   c                 C   s"   t | trt| S ttt| S r4   )r   r   r=   r   r,   r,   r-   rS     s    rS   c                O   sL   dd l }t| dr$|j| f||S t| } |jt| d|df||S d S Nr   rk   rbrP   )r2   hasattrr   r=   rj   )filepath_or_bufferrO   r~   r   r2   r,   r,   r-   
xgzip_open  s
    
r   c                O   sL   dd l }t| dr$|j| f||S t| } |jt| d|df||S d S r   )numpyr   loadr=   rj   )r   rO   r~   r   npr,   r,   r-   xnumpy_load  s
    
r   c                 K   sd   dd l }t| dr |j| f|S t| } |dddkrHt| |d|d< |jt| d|df|S d S )Nr   rk   r   ZinferrP   r   )pandasr   Zread_csvr=   r   r   rj   r   rO   r   pdr,   r,   r-   xpandas_read_csv  s    
r   c              
   K   s   dd l }t| drPz|j| f|W S  tk
rL   |jt|  f| Y S X nXt| } z|jt| d|df|W S  tk
r   |jtt| d|d f| Y S X d S r   )r   r   Z
read_excelr   r   rk   r=   rj   r   r,   r,   r-   xpandas_read_excel   s    
r   c                 K   s@   dd l m} t| dr$|j| f|S |jt| d|df|S d S r   )Zscipy.ior   r   Zloadmatrj   )r   rO   r   sior,   r,   r-   xsio_loadmat  s    
r   c              
   C   sL   t | drtj| |dS t| d|d}tj||dW  5 Q R  S Q R X dS )a  Extend `xml.etree.ElementTree.parse` function to support remote files.

    Args:
        source: File path or file object.
        parser (`XMLParser`, *optional*, default `XMLParser`): Parser instance.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `xml.etree.ElementTree.Element`: Root element of the given source document.
    rk   )parserr   rP   N)r   ETparserj   )sourcer   rO   rl   r,   r,   r-   	xet_parse  s    
r   c              
   K   sT   t | drtjjj| f|S t| d|d }tjjj|f|W  5 Q R  S Q R X dS )a  Extend `xml.dom.minidom.parse` function to support remote files.

    Args:
        filename_or_file (`str` or file): File path or file object.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs (optional): Additional keyword arguments passed to `xml.dom.minidom.parse`.

    Returns:
        :obj:`xml.dom.minidom.Document`: Parsed document.
    rk   r   rP   N)r   r%   domZminidomr   rj   )Zfilename_or_filerO   r   rl   r,   r,   r-   xxml_dom_minidom_parse-  s    
r   c                   @   s&   e Zd ZdZedddZdd ZdS )_IterableFromGeneratorzkUtility class to create an iterable from a generator function, in order to reset the generator when needed.)	generatorc                 O   s   || _ || _|| _d S r4   r   r~   r   )r   r   r~   r   r,   r,   r-   __init__B  s    z_IterableFromGenerator.__init__c                 c   s   | j | j| jE d H  d S r4   r   r   r,   r,   r-   __iter__G  s    z_IterableFromGenerator.__iter__N)r9   r:   r;   r   r   r   r   r,   r,   r,   r-   r   ?  s   r   c                   @   s   e Zd ZdZedd Zedd Zeee	ddf ddd	Z
edeee ee	ddf d
ddZed dddZedee d dddZdS )ArchiveIterablezIAn iterable of (path, fileobj) from a TAR archive, used by `iter_archive`c                 c   sf   t j| dd}|D ]L}|j}| s&q|d kr0qtj|drDq||}||fV  g |_	q~d S )Nzr|*)fileobjr   r&   __)
tarfiler   r   isregr?   r@   r]   r   extractfilemembers)rl   streamtarinfo	file_pathr   r,   r,   r-   	_iter_tarN  s    

zArchiveIterable._iter_tarc                 c   s^   t | }| D ]F}|j}| r&q|d kr0qtj|drDq|	|}||fV  qd S )Nr  )
zipfileZipFileinfolistfilenameis_dirr?   r@   r]   r   r   )rl   Zzipfmemberr	  r   r,   r,   r-   	_iter_zip_  s    

zArchiveIterable._iter_zipNr   c                 c   s6   t |}|dkr"| |E d H  n| |E d H  d S )Nr/   )r   r  r
  )clsrl   r   r,   r,   r-   _iter_from_fileobjn  s    z"ArchiveIterable._iter_from_fileobjr   c              	   c   sT   t ||d}t|d|d0}|dkr6| |E d H  n| |E d H  W 5 Q R X d S )NrP   r   r/   )r   rj   r  r
  )r  rN   rO   r   rl   r,   r,   r-   _iter_from_urlpathv  s
    z"ArchiveIterable._iter_from_urlpathc                 C   s   | | j |S r4   )r  )r  r   r,   r,   r-   from_buf  s    zArchiveIterable.from_bufrd   c                 C   s   | | j ||S r4   )r  )r  urlpath_or_bufrO   r,   r,   r-   from_urlpath  s    zArchiveIterable.from_urlpath)N)N)r9   r:   r;   r   staticmethodr
  r  classmethodr
   r   r  r=   r   r   r  r  r  r,   r,   r,   r-   r   K  s$   

  
r   c                   @   s\   e Zd ZdZed	eeee f ee	 e
eddf dddZed
ee	 d dddZdS )FilesIterablez8An iterable of paths from a list of directories or filesN)urlpathsrO   re   c                 c   s   t |ts|g}|D ]}t||dr<t|dr4q|V  qt||drt||dD ]Z\}}}tdd |D |d d < t|drqTt|D ]}|drqt||V  qqTqt	|qd S )NrP   r  c                 S   s   g | ]}| d s|qS )r  )r   )r+   rI   r,   r,   r-   r     s     
 z5FilesIterable._iter_from_urlpaths.<locals>.<listcomp>)
r   listrg   r^   r   rs   r   sortedrF   r   )r  r  rO   rN   r   r   r   r  r,   r,   r-   _iter_from_urlpaths  s"    

z!FilesIterable._iter_from_urlpathsrd   c                 C   s   | | j ||S r4   )r  )r  r  rO   r,   r,   r-   from_urlpaths  s    zFilesIterable.from_urlpaths)N)N)r9   r:   r;   r   r  r   r=   r   r   r   r
   r  r  r,   r,   r,   r-   r    s     r  c                   @   s   e Zd ZdZdZdee ee ee ee dddZe	dd Z
d	d
 ZeedddZdd ZeedddZdd Zeeejf ee dddZeeee f ee dddZdS )StreamingDownloadManagera  
    Download manager that uses the "::" separator to navigate through (possibly remote) compressed archives.
    Contrary to the regular `DownloadManager`, the `download` and `extract` methods don't actually download nor extract
    data, but they rather return the path or url that could be opened using the `xopen` function which extends the
    built-in `open` function to stream data from remote files.
    TN)dataset_namedata_dirrO   	base_pathc                 C   s.   || _ || _|ptjd| _|p&t | _d S )Nr&   )Z_dataset_name	_data_dirr?   r@   abspath
_base_pathr   rO   )r   r!  r"  rO   r#  r,   r,   r-   r     s    z!StreamingDownloadManager.__init__c                 C   s   | j S r4   )r$  r   r,   r,   r-   
manual_dir  s    z#StreamingDownloadManager.manual_dirc                 C   s   t | j|dd}|S )aU  Normalize URL(s) of files to stream data from.
        This is the lazy version of `DownloadManager.download` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input url_or_urls.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        TZ	map_tuple)r   	_downloadr   url_or_urlsr,   r,   r-   download  s    z!StreamingDownloadManager.download)rN   re   c                 C   s    t |}t|rt| j|}|S r4   )r=   r   r   r&  )r   rN   r,   r,   r-   r)    s    z"StreamingDownloadManager._downloadc                 C   s   t | j|dd}|S )a  Add extraction protocol for given url(s) for streaming.

        This is the lazy version of `DownloadManager.extract` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        Tr(  )r   _extract)r   r+  r  r,   r,   r-   extract  s    z StreamingDownloadManager.extractc                 C   s   t |}t|| jd}|dd }t|}|dks>|drNtd| d|d krZ|S |tkrtj	
|dd }d|kr|d |d n|}| d	| d| S | d
| S d S )NrP   r<   r   r   r   z+Extraction protocol for TAR archives like 'z' is not implemented in streaming mode. Please use `dl_manager.iter_archive` instead.

Example usage:

	url = dl_manager.download(url)
	tar_archive_iterator = dl_manager.iter_archive(url)

	for filename, file in tar_archive_iterator:
		...r&   rn   z://::)r=   r   rO   r>   r   rK   r   !SINGLE_FILE_COMPRESSION_PROTOCOLSr?   r@   r]   rindex)r   rN   r)   r@   r'   Z
inner_filer,   r,   r-   r-    s    
	z!StreamingDownloadManager._extractc                 C   s   |  | |S )a0  Prepare given `url_or_urls` for streaming (add extraction protocol).

        This is the lazy version of `DownloadManager.download_and_extract` for streaming.

        Is equivalent to:

        ```
        urls = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) to stream from data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.
        )r.  r,  r*  r,   r,   r-   download_and_extract  s    z-StreamingDownloadManager.download_and_extract)r  re   c                 C   s(   t |drt|S tj|| jdS dS )aN  Iterate over files within an archive.

        Args:
            urlpath_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        rk   rP   N)r   r   r  r  rO   )r   r  r,   r,   r-   iter_archive)  s    

z%StreamingDownloadManager.iter_archive)r  re   c                 C   s   t j|| jdS )a  Iterate over files.

        Args:
            urlpaths (`str` or `list` of `str`):
                Root paths.

        Yields:
            str: File URL path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        rP   )r  r  rO   )r   r  r,   r,   r-   
iter_filesB  s    z#StreamingDownloadManager.iter_files)NNNN)r9   r:   r;   r   Zis_streamingr   r=   r   r   r   r'  r,  r)  r.  r-  r1  r   r   BufferedReaderr   r   r2  r   r3  r,   r,   r,   r-   r     s*       
r   )N)N)N)N)N)N)N)N)r   )N)N)N)N)N)NN)N)gr   r   r?   rB   rer  r{   Zxml.dom.minidomr%   r  asyncior   r   	itertoolsr   pathlibr   r   typingr   r   r	   r
   r   r   r   r   r   Z	xml.etreer   r   rV   Zaiohttp.client_exceptionsr   r   r   Zfilesystemsr   Zutils.file_utilsr   r   r   r   r   r   Zutils.loggingr   Zutils.py_utilsr   rO   r   r9   ry   r   r   r/  compiler   bytesfromhexr   r   maxr   	Exceptionr8   rF   rM   r=   r\   r^   r`   rc   boolrg   intrm   rs   rv   r   r   r   r   rU   r   rj   r   r   r   r   r   rS   r   r   r   r   r   r   r   r   r   r  r   r,   r,   r,   r-   <module>   s   ,  
         

    9- 	

	?%