U
    ,:%e¥  ã                   @   s†   d dl mZ d dlZd dlmZ d dlZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZ	e
eeee	d	œd
d„Ze	dœdd„ZdS )é    )ÚTupleNc                       s<   e Zd ZdZeedœ‡ fdd„Zejejdœdd„Z‡  Z	S )ÚAttPoolz±Attention-Pooling module that estimates the attention score.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    ©Ú	input_dimÚatt_dimc                    s.   t t| ƒ ¡  t |d¡| _t ||¡| _d S )Né   )Úsuperr   Ú__init__ÚnnÚLinearÚlinear1Úlinear2©Úselfr   r   ©Ú	__class__© úa/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torchaudio/models/squim/subjective.pyr	      s    zAttPool.__init__©ÚxÚreturnc                 C   sF   |   |¡}| dd¡}tjj|dd}t ||¡ d¡}|  |¡}|S )zïApply attention and pooling.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Attention score with dimensions `(batch, att_dim)`.
        é   r   ©Údim)	r   Z	transposer
   Ú
functionalÚsoftmaxÚtorchÚmatmulZsqueezer   )r   r   Zattr   r   r   Úforward   s    


zAttPool.forward©
Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úintr	   r   ÚTensorr   Ú__classcell__r   r   r   r   r      s   r   c                       s<   e Zd ZdZeedœ‡ fdd„Zejejdœdd„Z‡  Z	S )Ú	PredictorzÏPrediction module that apply pooling and attention, then predict subjective metric scores.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    r   c                    s$   t t| ƒ ¡  t||ƒ| _|| _d S ©N)r   r'   r	   r   Úatt_pool_layerr   r   r   r   r   r	   0   s    zPredictor.__init__r   c                 C   sD   |   |¡}tjj|dd}tjdd| j|jd}|| jdd}|S )a  Predict subjective evaluation metric score.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   r   r   é   )ZstepsÚdevice)	r)   r
   r   r   r   Zlinspacer   r+   Úsum)r   r   ÚBr   r   r   r   5   s
    	
zPredictor.forwardr   r   r   r   r   r'   (   s   r'   c                       sf   e Zd ZdZejejejdœ‡ fdd„Zejeje	ejejf dœdd„Z
ejejdœd	d
„Z‡  ZS )ÚSquimSubjectiveaP  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **subjective** metric scores
    for speech enhancement (e.g., Mean Opinion Score (MOS)). The model is adopted from *NORESQA-MOS*
    :cite:`manocha2022speech` which predicts MOS scores given the input speech and a non-matching reference.

    Args:
        ssl_model (torch.nn.Module): The self-supervised learning model for feature extraction.
        projector (torch.nn.Module): Projection layer that projects SSL feature to a lower dimension.
        predictor (torch.nn.Module): Predict the subjective scores.
    )Ú	ssl_modelÚ	projectorÚ	predictorc                    s$   t t| ƒ ¡  || _|| _|| _d S r(   )r   r.   r	   r/   r0   r1   )r   r/   r0   r1   r   r   r   r	   P   s    zSquimSubjective.__init__)ÚwaveformÚ	referencer   c                    s`   |j d }ˆ j d }||k rH|| d }tj‡ fdd„t|ƒD ƒdd‰ |ˆ dd…d|…f fS )aÙ  Cut or pad the reference Tensor to make it aligned with waveform Tensor.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor, torch.Tensor): The aligned waveform and reference Tensors
                with same dimensions `(batch, time)`.
        éÿÿÿÿr   c                    s   g | ]}ˆ ‘qS r   r   )Ú.0Ú_©r3   r   r   Ú
<listcomp>e   s     z1SquimSubjective._align_shapes.<locals>.<listcomp>r   N)Úshaper   ÚcatÚrange)r   r2   r3   Z
T_waveformZT_referenceZnum_paddingr   r7   r   Ú_align_shapesV   s    

 zSquimSubjective._align_shapes)r2   r3   c                 C   sh   |   ||¡\}}|  | j |¡d d ¡}|  | j |¡d d ¡}tj||fdd}|  |¡}d| S )a‰  Predict subjective evaluation metric score.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   r4   r   r   é   )r<   r0   r/   Zextract_featuresr   r:   r1   )r   r2   r3   ÚconcatZ
score_diffr   r   r   r   h   s    

zSquimSubjective.forward)r    r!   r"   r#   r
   ÚModuler	   r   r%   r   r<   r   r&   r   r   r   r   r.   E   s   
"r.   )Ússl_typeÚfeat_dimÚproj_dimr   r   c                 C   s4   t tj| ƒƒ }t ||¡}t|d |ƒ}t|||ƒS )a£  Build a custome :class:`torchaudio.prototype.models.SquimSubjective` model.

    Args:
        ssl_type (str): Type of self-supervised learning (SSL) models.
            Must be one of ["wav2vec2_base", "wav2vec2_large"].
        feat_dim (int): Feature dimension of the SSL feature representation.
        proj_dim (int): Output dimension of projection layer.
        att_dim (int): Dimension of attention scores.
    r   )ÚgetattrÚ
torchaudioÚmodelsr
   r   r'   r.   )r@   rA   rB   r   r/   r0   r1   r   r   r   Úsquim_subjective_modelz   s    rF   )r   c                   C   s   t dddddS )zXBuild :class:`torchaudio.prototype.models.SquimSubjective` model with default arguments.Zwav2vec2_basei   é    r=   )r@   rA   rB   r   )rF   r   r   r   r   Úsquim_subjective_base   s    ürH   )Útypingr   r   Ztorch.nnr
   rD   r?   r   r'   r.   Ústrr$   rF   rH   r   r   r   r   Ú<module>   s    6û