U
    -e0                     @   s   d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlZddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ erddlmZ d\ZZ Z!eG dd dZ"dS )    N)	dataclassfield)BytesIO)TYPE_CHECKINGAnyClassVarDictOptionalUnion   )config)DownloadConfig)xopen	xsplitext)
array_cast)no_op_if_value_is_nullstring_to_dict   )FeatureType)FFFc                	   @   s0  e Zd ZU dZdZee ed< dZe	ed< dZ
e	ed< dZee ed< dZee ed	< ee e d
Zee ed< ed dddZeed< dd Zeeeef edddZd eeeeeee	df f  edddZedeedf f dddZeejej f ej dddZ!ej ej dddZ"dS )!Audioa1  Audio [`Feature`] to extract audio data from an audio file.

    Input: The Audio feature accepts as input:
    - A `str`: Absolute path to the audio file (i.e. random access is allowed).
    - A `dict` with the keys:

        - `path`: String with relative path of the audio file to the archive file.
        - `bytes`: Bytes content of the audio file.

      This is useful for archived files with sequential access.

    - A `dict` with the keys:

        - `path`: String with relative path of the audio file to the archive file.
        - `array`: Array containing the audio sample
        - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample.

      This is useful for archived files with sequential access.

    Args:
        sampling_rate (`int`, *optional*):
            Target sampling rate. If `None`, the native sampling rate is used.
        mono (`bool`, defaults to `True`):
            Whether to convert the audio signal to mono by averaging samples across
            channels.
        decode (`bool`, defaults to `True`):
            Whether to decode the audio data. If `False`,
            returns the underlying dictionary in the format `{"path": audio_path, "bytes": audio_bytes}`.

    Example:

    ```py
    >>> from datasets import load_dataset, Audio
    >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train")
    >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
    >>> ds[0]["audio"]
    {'array': array([ 2.3443763e-05,  2.1729663e-04,  2.2145823e-04, ...,
         3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32),
     'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
     'sampling_rate': 16000}
    ```
    Nsampling_rateTmonodecodeiddictdtypebytespathpa_typeF)defaultinitrepr_typec                 C   s   | j S N)r   )self r&   X/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/features/audio.py__call__M   s    zAudio.__call__)valuereturnc              
   C   s  zddl }W n, tk
r8 } ztd|W 5 d}~X Y nX t|trNd|dS t|trb|ddS d|krt }|j||d |d dd | ddS |d	dk	rnt	j
|d	 rn|d	 d
r\|ddkrtd|drtj|d tjdtjd }n tj|d	 dddtjd }tt }|j|||d dd | ddS d|d	dS nF|ddk	s|d	dk	r|d|d	dS td| ddS )zEncode example into a format for Arrow.

        Args:
            value (`str` or `dict`):
                Data passed as input to Audio feature.

        Returns:
            `dict`
        r   Nz;To support encoding audio data, please install 'soundfile'.r   arrayr   Zwav)formatr   ZpcmzBTo use PCM files, please specify a 'sampling_rate' in Audio objectr   )r   i  hr)r   modezUAn audio sample should have one of 'path' or 'bytes' but they are missing or None in .)	soundfileImportError
isinstancestrr   r   writegetvaluegetosr   isfileendswithKeyErrornpZ
frombufferZint16ZastypeZfloat32Zmemmap
ValueError)r%   r)   sferrbufferZbytes_valuer&   r&   r'   encode_exampleP   s8    




"" 
 
zAudio.encode_example)r)   token_per_repo_idr*   c              
   C   s  | j std|d dk	r.|d t|d fn
|d df\}}|dkr^|dkr^td| dzddl}ddl}W n, tk
r } ztd|W 5 d}~X Y nX |dk	rt|d	 d	d  nd}t	j
s|d
krtdnt	js|dkrtd|dkr|pi }|dd }	|	t	jr(t	jnt	j}
zt|	|
d }|| }W n ttfk
rf   d}Y nX t|d}t|d|d}||\}}W 5 Q R X n||\}}|j}| jr||}| jr| j|kr|j||| jd}| j}|||dS )a  Decode example audio file into audio data.

        Args:
            value (`dict`):
                A dictionary with keys:

                - `path`: String with relative audio file path.
                - `bytes`: Bytes of the audio file.
            token_per_repo_id (`dict`, *optional*):
                To access and decode
                audio files from private repositories on the Hub, you can pass
                a dictionary repo_id (`str`) -> token (`bool` or `str`)

        Returns:
            `dict`
        zMDecoding is disabled for this feature. Please use Audio(decode=True) instead.r   Nr   zJAn audio sample should have one of 'path' or 'bytes' but both are None in r0   r   zJTo support decoding audio files, please install 'librosa' and 'soundfile'.r   ZopuszDecoding 'opus' files requires system library 'libsndfile'>=1.0.31, You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. Zmp3zDecoding 'mp3' files requires system library 'libsndfile'>=1.1.0, You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. z::repo_id)tokenrb)download_config)Zorig_srZ	target_sr)r   r+   r   )r   RuntimeErrorr   r=   librosar1   r2   r   lowerr   ZIS_OPUS_SUPPORTEDZIS_MP3_SUPPORTEDsplit
startswithZHF_ENDPOINTZHUB_DATASETS_URLZHUB_DATASETS_HFFS_URLr   r;   r   r   readTr   Zto_monor   Zresample)r%   r)   rB   r   filerI   r>   r?   Zaudio_format
source_urlpatternrD   rE   rG   fr+   r   r&   r&   r'   decode_example   sP    0$




zAudio.decode_exampler   )r*   c                 C   s,   ddl m} | jrtd|d|ddS )z[If in the decodable state, raise an error, otherwise flatten the feature into a dictionary.r   )Valuez'Cannot flatten a decoded Audio feature.binarystringr   )featuresrT   r   r=   )r%   rT   r&   r&   r'   flatten   s    zAudio.flatten)storager*   c                 C   sv  t j|jrLt jdgt| t  d}t jj||gddg|	 d}nt j
|jrt jdgt| t  d}t jj||gddg|	 d}nt j|jr|jdrt dd | D }nt j|jrj|jdd	kr|d}nt jdgt| t  d}|jdd	kr0|d}nt jdgt| t  d}t jj||gddg|	 d}t|| jS )
a  Cast an Arrow array to the Audio arrow storage type.
        The Arrow types that can be converted to the Audio pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the audio bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter

        Args:
            storage (`Union[pa.StringArray, pa.StructArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Audio arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`
        Ntyper   r   maskr+   c                 S   s$   g | ]}|d k	rt  |nd qS r$   )r   rA   .0xr&   r&   r'   
<listcomp>   s     z&Audio.cast_storage.<locals>.<listcomp>r   )patypes	is_stringr[   r+   lenrU   StructArrayfrom_arraysis_nullZ	is_binaryrV   Z	is_structZget_all_field_indices	to_pylistZget_field_indexr   r   r   r%   rY   Zbytes_arrayZ
path_arrayr&   r&   r'   cast_storage   s"    " zAudio.cast_storagec                    s   t dd  tj fdd| D t d}tjdd |d D t d}tjj||gddg|	 d	}t
|| jS )
a8  Embed audio files into the Arrow array.

        Args:
            storage (`pa.StructArray`):
                PyArrow array to embed.

        Returns:
            `pa.StructArray`: Array in the Audio arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        c              	   S   s"   t | d}| }W 5 Q R X |S )NrF   )r   rM   )r   rR   bytes_r&   r&   r'   path_to_bytes  s    z*Audio.embed_storage.<locals>.path_to_bytesc                    s8   g | ]0}|d k	r0|d d kr( |d q2|d nd qS )Nr   r   r&   r^   rm   r&   r'   ra     s   z'Audio.embed_storage.<locals>.<listcomp>rZ   c                 S   s$   g | ]}|d k	rt j|nd qS r$   )r8   r   basename)r_   r   r&   r&   r'   ra     s     r   r   r\   )r   rb   r+   ri   rU   r   rV   rf   rg   rh   r   r   rj   r&   rn   r'   embed_storage   s    

zAudio.embed_storage)N)#__name__
__module____qualname____doc__r   r	   int__annotations__r   boolr   r   r4   r   r   rb   structrU   rV   r   r   r   r#   r(   r
   r   r   rA   r   rS   rX   ZStringArrayrf   rk   rp   r&   r&   r&   r'   r      s&   
+$2  H&r   )#r8   dataclassesr   r   ior   typingr   r   r   r   r	   r
   numpyr<   Zpyarrowrb    r   Zdownload.download_configr   Z#download.streaming_download_managerr   r   tabler   Zutils.py_utilsr   r   rW   r   Z_ffmpeg_warnedZ_librosa_warnedZ_audioread_warnedr   r&   r&   r&   r'   <module>   s    
