U
    ,:%eS                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d	Zd
ZdZdgdgddgdZeee eeeeef  dddZG dd deZdS )    N)Path)ListTupleUnion)Tensor)Dataset)download_url_to_file)_get_librispeech_metadata)_extract_tarZlibrispeech_finetuningzIhttps://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgzZ@5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342afz1h/0z1h/*Z9h)10minZ1hZ10h)pathfolders
_ext_audioreturnc                    s^   t   g }|D ]8} fdd | d| D }|dd |D 7 }q|jdd d |S )a  Get the file names and the corresponding file paths without `speaker_id`
    and `chapter_id` directories.
    The format of path is like:
        {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
        {root}/{_ARCHIVE_NAME}/9h/[clean, other]

    Args:
        path (Path): Root path to the dataset.
        folders (List[str]): Folders that contain the desired audio files.
        _ext_audio (str): Extension of audio files.

    Returns:
        List[Tuple[str, str]]:
            List of tuples where the first element is the relative path to the audio file.
            The format of relative path is like:
            1h/[0-5]/[clean, other] or 9h/[clean, other]
            The second element is the file name without audio extension.
    c                    s   g | ]}|  qS  )relative_to.0pr   r   e/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py
<listcomp>*   s     z&_get_fileids_paths.<locals>.<listcomp>z/*/*/*/*c                 S   s$   g | ]}t |jjjt |jfqS r   )strparentstemr   r   r   r   r   +   s     c                 S   s   | d | d  S )Nr      r   )xr   r   r   <lambda>,       z$_get_fileids_paths.<locals>.<lambda>)key)r   globsort)r   r   r   Zfiles_pathsfolderpathsr   r   r   _get_fileids_paths   s    "r$   c                   @   sd   e Zd ZdZdZdZdeeef ee	dddd	Z
eeeeeeeef d
ddZedddZdS )LibriLightLimiteda  Subset of Libri-light :cite:`librilight` dataset,
    which was used in HuBERT :cite:`hsu2021hubert` for supervised fine-tuning.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        subset (str, optional): The subset to use. Options: [``"10min"``, ``"1h"``, ``"10h"``]
            (Default: ``"10min"``).
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
    z
.trans.txtz.flacr   FN)rootsubsetdownloadr   c                 C   s   |t kr tdt   d| t | }t|}tj|t| _tj|t d}tj	| js|spt
dtj|stt|td t| t| j|| j| _d S )Nz`subset` must be one of z	. Found: z.tgzz9Dataset not found. Please use `download=True` to download)Zhash_prefix)_SUBSET_MAP
ValueErrorkeysosfspathr   join_ARCHIVE_NAME_pathisdirRuntimeErrorisfiler   _URL	_CHECKSUMr
   r$   r   _fileids_paths)selfr&   r'   r(   r   archiver   r   r   __init__?   s    
zLibriLightLimited.__init__)nr   c                 C   sT   | j | \}}t|| j|| j| j}ttj	| j|d \}}|f|dd  S )a  Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded
        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
        r   r   N)
r6   r	   r0   r   _ext_txt
torchaudioloadr,   r   r.   )r7   r:   	file_pathZfileidmetadataZwaveform_r   r   r   __getitem__T   s    zLibriLightLimited.__getitem__)r   c                 C   s
   t | jS )N)lenr6   )r7   r   r   r   __len__n   s    zLibriLightLimited.__len__)r   F)__name__
__module____qualname____doc__r;   r   r   r   r   boolr9   intr   r   rA   rC   r   r   r   r   r%   0   s     
 r%   )r,   pathlibr   typingr   r   r   r<   Ztorchr   Ztorch.utils.datar   Ztorchaudio._internalr   Ztorchaudio.datasets.librispeechr	   Ztorchaudio.datasets.utilsr
   r/   r4   r5   r)   r   r$   r%   r   r   r   r   <module>   s   $