U
    -e                     @   sl	  d dl mZ d dlmZmZmZmZ d dlmZ ddl	m
Z
mZ g ZeG dd dZeG dd	 d	eZed
ddddddddgdddddddddddddddddddZde_edddddddddgddddddddddddddde dddd Zd!e_ed"ddddddddgddddddddddddddde dddd#Zd$e_ed%ddddddddgddddddddddddddde dddd#Zd&e_ed'ddddddddgdd(dddd)ddd*dddd+dddddd,Zd-e_ed.ddddddddgdd(dddd)ddd*dddd+dde dddd#Zd/e_ed0ddddddddgdd(dddd)ddd*dddd+dde dddd#Zd1e_ed2ddddddddgdd(dddd)ddd*dddd+dde dddd#Zd3e_ed4d5dddddddgd6d(dddd)ddd*ddd6ddddd6dd,Zd7e_ed8d5dddddddgd6d(dddd)ddd*ddd6ddde dd6dd#Zd9e_ed:d5dddddddgd6d(dddd)ddd*ddd6ddde dd6dd#Zd;e_ed<d5dddddddgd6d(dddd)ddd*ddd6ddde dd6dd#Zd=e_ed>d5dddddddgd6d(dddd)ddd*ddd6ddddd6dd,Zd?e_ed@ddddddddgddddddddddddddddddd,ZdAe_edBd5dddddddgdd(dddd)ddd*ddd6ddddd6dd,ZdCe_edDd5dddddddgddEddddFdddGddd6ddddd6dd,Z dHe _edId5dddddddgdd(dddd)ddd*ddd6ddde dd6dd#Z!dJe!_edKd5dddddddgddEddddFdddGddd6ddde dd6dd#Z"dLe"_edMddddddddgddddddddddddddNde# dddOddPZ$dQe$_edRddddddddgddddddddddddddSde% dddTddPZ&dUe&_edVddddddddgddddddddddddddWde' dddOddPZ(dXe(_edYddddddddgddddddddddddddZde) dddd#Z*d[e*_ed\ddddddddgdddddddddddddd]de+ ddd^ddPZ,d_e,_ed`ddddddddgddddddddadbddddddddcdddddeZ-dfe-_edgddddddddgddddddddadbddddddddcdddddeZ.dhe._edid5dddddddgdd(dddd)ddadbdd*ddddddcdddd6deZ/dje/_edkd5dddddddgd6d(dddd)ddd*ddd6dddddd6deZ0dle0_edmd5dddddddgd6dEddddFdddGddd6dddddd6deZ1dne1_edod5dddddddgd6dpddddFdddqddd6dddddd6deZ2dre2_eG dsdt dteZ3e3dud5dddddddgd6d(dddd)ddd*ddd6ddSde4 dd6dd#Z5dve5_dS )w    )	dataclass)AnyDictOptionalTuple)Module   )alignerutilsc                   @   sp   e Zd ZU dZeed< eeef ed< eed< e	ed< eed< e
eddd	Zd
d ZddedddZdS )Wav2Vec2Bundleu  Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model`.

    This class provides interfaces for instantiating the pretrained model along with
    the information necessary to retrieve pretrained weights and additional data
    to be used with the model.

    Torchaudio library instantiates objects of this class, each of which represents
    a different pretrained model. Client code should access pretrained models via these
    instances.

    Please see below for the usage and the available values.

    Example - Feature Extraction
        >>> import torchaudio
        >>>
        >>> bundle = torchaudio.pipelines.HUBERT_BASE
        >>>
        >>> # Build the model and load pretrained weight.
        >>> model = bundle.get_model()
        Downloading:
        100%|███████████████████████████████| 360M/360M [00:06<00:00, 60.6MB/s]
        >>>
        >>> # Resample audio to the expected sampling rate
        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
        >>>
        >>> # Extract acoustic features
        >>> features, _ = model.extract_features(waveform)
    _path_params_sample_rate_normalize_waveform_model_typereturnc                 C   s   | j S )zUSample rate of the audio that the model is trained on.

        :type: float
        )r   self r   d/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/torchaudio/pipelines/_wav2vec2/impl.pysample_rate1   s    zWav2Vec2Bundle.sample_ratec                 C   s   t | j|S N)r
   _get_state_dictr   r   	dl_kwargsr   r   r   r   9   s    zWav2Vec2Bundle._get_state_dictNr   c                C   sD   t | j| j}| |}|| | jr8t j|dd}|  |S )a  Construct the model and load the pretrained weight.

        The weight file is downloaded from the internet and cached with
        :func:`torch.hub.load_state_dict_from_url`

        Args:
            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.

        Returns:
            Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.

            For the models listed below, an additional layer normalization is performed on the input.

            For all other models, a :py:class:`~torchaudio.models.Wav2Vec2Model` instance is returned.

            - WAV2VEC2_LARGE_LV60K
            - WAV2VEC2_ASR_LARGE_LV60K_10M
            - WAV2VEC2_ASR_LARGE_LV60K_100H
            - WAV2VEC2_ASR_LARGE_LV60K_960H
            - WAV2VEC2_XLSR53
            - WAV2VEC2_XLSR_300M
            - WAV2VEC2_XLSR_1B
            - WAV2VEC2_XLSR_2B
            - HUBERT_LARGE
            - HUBERT_XLARGE
            - HUBERT_ASR_LARGE
            - HUBERT_ASR_XLARGE
            - WAVLM_LARGE
        T)normalize_waveform)	r
   
_get_modelr   r   r   load_state_dictr   _extend_modeleval)r   r   model
state_dictr   r   r   	get_model=   s    

zWav2Vec2Bundle.get_model)__name__
__module____qualname____doc__str__annotations__r   r   floatboolpropertyr   r   r   r$   r   r   r   r   r      s   
r   c                   @   s\   e Zd ZU dZeedf ed< dZeedf ed< ddeeedf dd	d
Z	dd Z
dS )Wav2Vec2ASRBundleu  Data class that bundles associated information to use pretrained
    :py:class:`~torchaudio.models.Wav2Vec2Model`.

    This class provides interfaces for instantiating the pretrained model along with
    the information necessary to retrieve pretrained weights and additional data
    to be used with the model.

    Torchaudio library instantiates objects of this class, each of which represents
    a different pretrained model. Client code should access pretrained models via these
    instances.

    Please see below for the usage and the available values.

    Example - ASR
        >>> import torchaudio
        >>>
        >>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE
        >>>
        >>> # Build the model and load pretrained weight.
        >>> model = bundle.get_model()
        Downloading:
        100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s]
        >>>
        >>> # Check the corresponding labels of the output.
        >>> labels = bundle.get_labels()
        >>> print(labels)
        ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
        >>>
        >>> # Resample audio to the expected sampling rate
        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
        >>>
        >>> # Infer the label probability distribution
        >>> emissions, _ = model(waveform)
        >>>
        >>> # Pass emission to decoder
        >>> # `ctc_decode` is for illustration purpose only
        >>> transcripts = ctc_decode(emissions, labels)
    ._labelsr         _remove_aux_axis-blank)r6   r   c                C   s   |f| j S )au  The output class labels.

        The first is blank token, and it is customizable.

        Args:
            blank (str, optional): Blank token. (default: ``'-'``)

        Returns:
            Tuple[str, ...]:
            For models fine-tuned on ASR, returns the tuple of strings representing
            the output class labels.

        Example
            >>> from torchaudio.pipelines import HUBERT_ASR_LARGE as bundle
            >>> bundle.get_labels()
            ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
        )r/   )r   r6   r   r   r   
get_labels   s    zWav2Vec2ASRBundle.get_labelsc                 C   s   t | j|| jS r   )r
   r   r   r3   r   r   r   r   r      s    z!Wav2Vec2ASRBundle._get_state_dictN)r%   r&   r'   r(   r   r)   r*   r3   intr7   r   r   r   r   r   r.   d   s   
'
r.   zwav2vec2_fairseq_base_ls960.pthZ
group_norm)   
      )r9   r2   r1   )r9   r1   r1   Fi   g?         i   g        g?N)extractor_modeextractor_conv_layer_configextractor_conv_biasencoder_embed_dimencoder_projection_dropoutencoder_pos_conv_kernelencoder_pos_conv_groupsencoder_num_layersencoder_num_headsencoder_attention_dropoutencoder_ff_interm_featuresencoder_ff_interm_dropoutencoder_dropoutencoder_layer_norm_firstencoder_layer_dropaux_num_outi>  ZWav2Vec2)r   r   r   r   r   a  Wav2vec 2.0 model ("base" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
z)wav2vec2_fairseq_base_ls960_asr_ll10m.pth   )r   r   r/   r   r   r   a9  Wav2vec 2.0 model ("base" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
:cite:`librilight` ("train-10min" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z)wav2vec2_fairseq_base_ls960_asr_ls100.pth)r/   r   r   r   a  Wav2vec 2.0 model ("base" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from "train-clean-100" subset.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z)wav2vec2_fairseq_base_ls960_asr_ls960.ptha  Wav2vec 2.0 model ("base" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z wav2vec2_fairseq_large_ls960.pthi      i   g?)r   r   r   a  Wav2vec 2.0 model ("large" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
z*wav2vec2_fairseq_large_ls960_asr_ll10m.ptha:  Wav2vec 2.0 model ("large" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
:cite:`librilight` ("train-10min" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z*wav2vec2_fairseq_large_ls960_asr_ls100.ptha%  Wav2vec 2.0 model ("large" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from
the same dataset ("train-clean-100" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z*wav2vec2_fairseq_large_ls960_asr_ls960.ptha  Wav2vec 2.0 model ("large" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z wav2vec2_fairseq_large_lv60k.pthZ
layer_normTaf  Wav2vec 2.0 model ("large-lv60k" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
not fine-tuned.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
z*wav2vec2_fairseq_large_lv60k_asr_ll10m.ptha  Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 10 minutes of transcribed audio from the same dataset ("train-10min" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z*wav2vec2_fairseq_large_lv60k_asr_ls100.ptha  Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 100 hours of transcribed audio from
*LibriSpeech* dataset :cite:`7178964` ("train-clean-100" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z*wav2vec2_fairseq_large_lv60k_asr_ls960.ptha+  Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* :cite:`librilight` dataset, and
fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z!wav2vec2_fairseq_large_xlsr53.ptha  Wav2vec 2.0 model ("base" architecture),
pre-trained on 56,000 hours of unlabeled audio from multiple datasets (
*Multilingual LibriSpeech* :cite:`Pratap_2020`,
*CommonVoice* :cite:`ardila2020common` and
*BABEL* :cite:`Gales2014SpeechRA`),
not fine-tuned.

Originally published by the authors of
*Unsupervised Cross-lingual Representation Learning for Speech Recognition*
:cite:`conneau2020unsupervised` under MIT License and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
zhubert_fairseq_base_ls960.ptha  HuBERT model ("base" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
zhubert_fairseq_large_ll60k.pthac  HuBERT model ("large" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
not fine-tuned.

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
zhubert_fairseq_xlarge_ll60k.pthi   0   i   ai  HuBERT model ("extra large" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
not fine-tuned.

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
z(hubert_fairseq_large_ll60k_asr_ls960.ptha  HuBERT model ("large" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z)hubert_fairseq_xlarge_ll60k_asr_ls960.ptha  HuBERT model ("extra large" architecture),
pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z%wav2vec2_voxpopuli_base_10k_asr_de.pt    )r   r1   r2   #   )r/   r   r   r3   r   a  wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 282 hours of transcribed audio from "de" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z%wav2vec2_voxpopuli_base_10k_asr_en.pt   )r   r1   r2      a  wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 543 hours of transcribed audio from "en" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z%wav2vec2_voxpopuli_base_10k_asr_es.ptrS   a  wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 166 hours of transcribed audio from "es" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z%wav2vec2_voxpopuli_base_10k_asr_fr.pt+   a  wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
z%wav2vec2_voxpopuli_base_10k_asr_it.pt%   r0   a  wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 91 hours of transcribed audio from "it" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
zwavlm_base.pthi   i@  )r?   r@   rA   rB   rC   rD   rE   rF   rG   Zencoder_max_distanceZencoder_num_bucketsrH   rI   rJ   rK   rL   rM   rN   ZWavLM)r   r   r   aB  WavLM Base model ("base" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`, not fine-tuned.

Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
redistributed with the same license.
[`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
`Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
zwavlm_base_plus.ptha  WavLM Base+ model ("base" architecture),
pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.

Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
redistributed with the same license.
[`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
`Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
zwavlm_large.ptha  WavLM Large model ("large" architecture),
pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.

Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
redistributed with the same license.
[`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
`Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
zwav2vec2_xlsr_300m.ptha(  XLS-R model with 300 million parameters,
pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
*Multilingual LibriSpeech* :cite:`Pratap_2020`,
*CommonVoice* :cite:`ardila2020common`,
*VoxLingua107* :cite:`valk2021voxlingua107`,
*BABEL* :cite:`Gales2014SpeechRA`, and
*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
not fine-tuned.

Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
redistributed with the same license.
[`License <https://github.com/facebookresearch/fairseq/blob/30c912b73c0f88d41171879b2f03226a171004ef/LICENSE>`__,
`Source <https://github.com/facebookresearch/fairseq/tree/30c912b73c0f88d41171879b2f03226a171004ef/examples/wav2vec/xlsr#xls-r>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
zwav2vec2_xlsr_1b.ptha&  XLS-R model with 1 billion parameters,
pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
*Multilingual LibriSpeech* :cite:`Pratap_2020`,
*CommonVoice* :cite:`ardila2020common`,
*VoxLingua107* :cite:`valk2021voxlingua107`,
*BABEL* :cite:`Gales2014SpeechRA`, and
*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
not fine-tuned.

Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
redistributed with the same license.
[`License <https://github.com/facebookresearch/fairseq/blob/30c912b73c0f88d41171879b2f03226a171004ef/LICENSE>`__,
`Source <https://github.com/facebookresearch/fairseq/tree/30c912b73c0f88d41171879b2f03226a171004ef/examples/wav2vec/xlsr#xls-r>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
zwav2vec2_xlsr_2b.pthi  i   a&  XLS-R model with 2 billion parameters,
pre-trained on 436,000 hours of unlabeled audio from multiple datasets (
*Multilingual LibriSpeech* :cite:`Pratap_2020`,
*CommonVoice* :cite:`ardila2020common`,
*VoxLingua107* :cite:`valk2021voxlingua107`,
*BABEL* :cite:`Gales2014SpeechRA`, and
*VoxPopuli* :cite:`voxpopuli`) in 128 languages,
not fine-tuned.

Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and
redistributed with the same license.
[`License <https://github.com/facebookresearch/fairseq/blob/30c912b73c0f88d41171879b2f03226a171004ef/LICENSE>`__,
`Source <https://github.com/facebookresearch/fairseq/tree/30c912b73c0f88d41171879b2f03226a171004ef/examples/wav2vec/xlsr#xls-r>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details.
c                       s   e Zd ZdZG dd dejZG dd dejZde	e
 e
ee
df d	 fd
dZdddeedddZde	e
 e
ee
ef d	ddZedddZedddZ  ZS )Wav2Vec2FABundleu  Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model` for forced alignment.

    This class provides interfaces for instantiating the pretrained model along with
    the information necessary to retrieve pretrained weights and additional data
    to be used with the model.

    Torchaudio library instantiates objects of this class, each of which represents
    a different pretrained model. Client code should access pretrained models via these
    instances.

    Please see below for the usage and the available values.

    Example - Feature Extraction
        >>> import torchaudio
        >>>
        >>> bundle = torchaudio.pipelines.MMS_FA
        >>>
        >>> # Build the model and load pretrained weight.
        >>> model = bundle.get_model()
        Downloading:
        100%|███████████████████████████████| 1.18G/1.18G [00:05<00:00, 216MB/s]
        >>>
        >>> # Resample audio to the expected sampling rate
        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
        >>>
        >>> # Estimate the probability of token distribution
        >>> emission, _ = model(waveform)
        >>>
        >>> # Generate frame-wise alignment
        >>> alignment, scores = torchaudio.functional.forced_align(
        >>>     emission, targets, input_lengths, target_lengths, blank=0)
        >>>
    c                   @   s   e Zd ZdZdS )zWav2Vec2FABundle.TokenizerzInterface of the tokenizerNr%   r&   r'   r(   r   r   r   r   	Tokenizer  s   rZ   c                   @   s   e Zd ZdZdS )zWav2Vec2FABundle.AlignerzInterface of the alignerNrY   r   r   r   r   Aligner  s   r[   *r4   .)starr6   r   c                    s$   t  j|d}|dkr|S ||fS )a  Get the labels corresponding to the feature dimension of emission.

        The first is blank token, and it is customizable.

        Args:
            star (str or None, optional): Change or disable star token. (default: ``"*"``)
            blank (str, optional): Change the blank token. (default: ``'-'``)

        Returns:
            Tuple[str, ...]:
            For models fine-tuned on ASR, returns the tuple of strings representing
            the output class labels.

        Example
            >>> from torchaudio.pipelines import MMS_FA as bundle
            >>> bundle.get_labels()
            ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x', '*')
            >>> bundle.get_labels(star=None)
            ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x')
        r5   N)superr7   )r   r]   r6   labels	__class__r   r   r7     s    zWav2Vec2FABundle.get_labelsTNr   )	with_starr   c                C   sL   t | j| j}t | j|| j}|| t j|| j	d|d}|
  |S )a'  Construct the model and load the pretrained weight.

        The weight file is downloaded from the internet and cached with
        :func:`torch.hub.load_state_dict_from_url`

        Args:
            with_star (bool, optional): If enabled, the last dimension of output layer is
                extended by one, which corresponds to `star` token.
            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.

        Returns:
            Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.

            .. note::

               The model created with this method returns probability in log-domain,
               (i.e. :py:func:`torch.nn.functional.log_softmax` is applied), whereas
               the other Wav2Vec2 models returns logit.
        T)r   Zapply_log_softmaxZappend_star)r
   r   r   r   r   r   r3   r   r    r   r!   )r   rb   r   r"   r#   r   r   r   r$   5  s    
   zWav2Vec2FABundle.get_modelc                 C   s   dd t | j||dD S )aM  Get the mapping from token to index (in emission feature dim)

        Args:
            star (str or None, optional): Change or disable star token. (default: ``"*"``)
            blank (str, optional): Change the blank token. (default: ``'-'``)

        Returns:
            Tuple[str, ...]:
            For models fine-tuned on ASR, returns the tuple of strings representing
            the output class labels.

        Example
            >>> from torchaudio.pipelines import MMS_FA as bundle
            >>> bundle.get_dict()
            {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27, '*': 28}
            >>> bundle.get_dict(star=None)
            {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27}
        c                 S   s   i | ]\}}||qS r   r   ).0ikr   r   r   
<dictcomp>e  s      z-Wav2Vec2FABundle.get_dict.<locals>.<dictcomp>)r]   r6   )	enumerater7   )r   r]   r6   r   r   r   get_dictR  s    zWav2Vec2FABundle.get_dictr   c                 C   s   t |  S )zIInstantiate a Tokenizer.

        Returns:
            Tokenizer
        )r	   rZ   rh   r   r   r   r   get_tokenizerg  s    zWav2Vec2FABundle.get_tokenizerc                 C   s   t jddS )zFInstantiate an Aligner.

        Returns:
            Aligner
        r   r5   )r	   r[   r   r   r   r   get_alignero  s    zWav2Vec2FABundle.get_aligner)r\   r4   )T)r\   r4   )r%   r&   r'   r(   r	   Z
ITokenizerrZ   ZIAlignerr[   r   r)   r   r7   r,   r   r$   r   r8   rh   ri   rj   __classcell__r   r   r`   r   rX     s   "$ rX   zQhttps://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pta  
Trained on 31K hours of data in 1,130 languages from *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling`.

Published by the authors of *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling` under [`CC-BY-NC 4.0 License <https://github.com/facebookresearch/fairseq/tree/100cd91db19bb27277a06a25eb4154c805b10189/examples/mms#license>`__].

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2FABundle` for usage details.

.. note::

   Unlike other Wav2Vec2 bundles, this model does not have a token for word boundary (like `|`). This makes the post-processing of alignments slightly different.
)6dataclassesr   typingr   r   r   r   Ztorch.nnr    r	   r
   __all__r   r.   ZWAV2VEC2_BASEr(   Z_get_en_labelsZWAV2VEC2_ASR_BASE_10MZWAV2VEC2_ASR_BASE_100HZWAV2VEC2_ASR_BASE_960HZWAV2VEC2_LARGEZWAV2VEC2_ASR_LARGE_10MZWAV2VEC2_ASR_LARGE_100HZWAV2VEC2_ASR_LARGE_960HZWAV2VEC2_LARGE_LV60KZWAV2VEC2_ASR_LARGE_LV60K_10MZWAV2VEC2_ASR_LARGE_LV60K_100HZWAV2VEC2_ASR_LARGE_LV60K_960HZWAV2VEC2_XLSR53ZHUBERT_BASEZHUBERT_LARGEZHUBERT_XLARGEZHUBERT_ASR_LARGEZHUBERT_ASR_XLARGEZ_get_de_labelsZVOXPOPULI_ASR_BASE_10K_DEZ_get_vp_en_labelsZVOXPOPULI_ASR_BASE_10K_ENZ_get_es_labelsZVOXPOPULI_ASR_BASE_10K_ESZ_get_fr_labelsZVOXPOPULI_ASR_BASE_10K_FRZ_get_it_labelsZVOXPOPULI_ASR_BASE_10K_ITZ
WAVLM_BASEZWAVLM_BASE_PLUSZWAVLM_LARGEZWAV2VEC2_XLSR_300MZWAV2VEC2_XLSR_1BZWAV2VEC2_XLSR_2BrX   Z_get_mms_labelsZMMS_FAr   r   r   r   <module>   s  WG	 	!	"	!	 	!	!	!	 	!	!	!	 	 	 	 	!	!	"	"	"	!	"	"	"	"	 	 	  	!