U
    ,:%e                     @   s   d dl mZ d dlZd dlmZ d dlZG dd dejZG dd dejZG dd dejZ	e
eeee	d	d
dZe	dddZdS )    )TupleNc                       s<   e Zd ZdZeed fddZejejdddZ  Z	S )AttPoolzAttention-Pooling module that estimates the attention score.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    	input_dimatt_dimc                    s.   t t|   t|d| _t||| _d S )N   )superr   __init__nnLinearlinear1linear2selfr   r   	__class__ a/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/torchaudio/models/squim/subjective.pyr	      s    zAttPool.__init__xreturnc                 C   sF   |  |}|dd}tjj|dd}t||d}| |}|S )zApply attention and pooling.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Attention score with dimensions `(batch, att_dim)`.
           r   dim)	r   Z	transposer
   
functionalsoftmaxtorchmatmulZsqueezer   )r   r   Zattr   r   r   forward   s    


zAttPool.forward
__name__
__module____qualname____doc__intr	   r   Tensorr   __classcell__r   r   r   r   r      s   r   c                       s<   e Zd ZdZeed fddZejejdddZ  Z	S )	PredictorzPrediction module that apply pooling and attention, then predict subjective metric scores.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    r   c                    s$   t t|   t||| _|| _d S N)r   r'   r	   r   att_pool_layerr   r   r   r   r   r	   0   s    zPredictor.__init__r   c                 C   sD   |  |}tjj|dd}tjdd| j|jd}|| jdd}|S )a  Predict subjective evaluation metric score.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   r   r      )Zstepsdevice)	r)   r
   r   r   r   Zlinspacer   r+   sum)r   r   Br   r   r   r   5   s
    	
zPredictor.forwardr   r   r   r   r   r'   (   s   r'   c                       sf   e Zd ZdZejejejd fddZejeje	ejejf dddZ
ejejdd	d
Z  ZS )SquimSubjectiveaP  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **subjective** metric scores
    for speech enhancement (e.g., Mean Opinion Score (MOS)). The model is adopted from *NORESQA-MOS*
    :cite:`manocha2022speech` which predicts MOS scores given the input speech and a non-matching reference.

    Args:
        ssl_model (torch.nn.Module): The self-supervised learning model for feature extraction.
        projector (torch.nn.Module): Projection layer that projects SSL feature to a lower dimension.
        predictor (torch.nn.Module): Predict the subjective scores.
    )	ssl_model	projector	predictorc                    s$   t t|   || _|| _|| _d S r(   )r   r.   r	   r/   r0   r1   )r   r/   r0   r1   r   r   r   r	   P   s    zSquimSubjective.__init__)waveform	referencer   c                    s`   |j d } j d }||k rH|| d }tj fddt|D dd | ddd|f fS )a  Cut or pad the reference Tensor to make it aligned with waveform Tensor.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor, torch.Tensor): The aligned waveform and reference Tensors
                with same dimensions `(batch, time)`.
        r   c                    s   g | ]} qS r   r   ).0_r3   r   r   
<listcomp>e   s     z1SquimSubjective._align_shapes.<locals>.<listcomp>r   N)shaper   catrange)r   r2   r3   Z
T_waveformZT_referenceZnum_paddingr   r7   r   _align_shapesV   s    

 zSquimSubjective._align_shapes)r2   r3   c                 C   sh   |  ||\}}| | j|d d }| | j|d d }tj||fdd}| |}d| S )a  Predict subjective evaluation metric score.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   r4   r   r      )r<   r0   r/   Zextract_featuresr   r:   r1   )r   r2   r3   concatZ
score_diffr   r   r   r   h   s    

zSquimSubjective.forward)r    r!   r"   r#   r
   Moduler	   r   r%   r   r<   r   r&   r   r   r   r   r.   E   s   
"r.   )ssl_typefeat_dimproj_dimr   r   c                 C   s4   t tj|  }t||}t|d |}t|||S )a  Build a custome :class:`torchaudio.prototype.models.SquimSubjective` model.

    Args:
        ssl_type (str): Type of self-supervised learning (SSL) models.
            Must be one of ["wav2vec2_base", "wav2vec2_large"].
        feat_dim (int): Feature dimension of the SSL feature representation.
        proj_dim (int): Output dimension of projection layer.
        att_dim (int): Dimension of attention scores.
    r   )getattr
torchaudiomodelsr
   r   r'   r.   )r@   rA   rB   r   r/   r0   r1   r   r   r   squim_subjective_modelz   s    rF   )r   c                   C   s   t dddddS )zXBuild :class:`torchaudio.prototype.models.SquimSubjective` model with default arguments.Zwav2vec2_basei       r=   )r@   rA   rB   r   )rF   r   r   r   r   squim_subjective_base   s    rH   )typingr   r   Ztorch.nnr
   rD   r?   r   r'   r.   strr$   rF   rH   r   r   r   r   <module>   s    6