U
    ,-e                     @   s  d Z ddlZddlmZmZmZ ddlZddlZddl	Zddlm
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ee Z!dZ"dZ#dZ$dddgZ%dZ&dZ'dZ(dZ)dZ*dgZ+dFee,e,f e-e,eej. e,ej/dddZ0G dd de
j1Z2G dd de
j1Z3G dd  d e
j1Z4G d!d" d"e
j1Z5G d#d$ d$e
j1Z6G d%d& d&e
j1Z7G d'd( d(e7Z8G d)d* d*e
j1Z9G d+d, d,e
j1Z:G d-d. d.e
j1Z;G d/d0 d0e
j1Z<G d1d2 d2e
j1Z=G d3d4 d4e
j1Z>G d5d6 d6e
j1Z?G d7d8 d8e
j1Z@G d9d: d:eZAd;ZBd<ZCed=eBG d>d? d?eAZDed@eBG dAdB dBeAZEedCeBG dDdE dEeAZFdS )Gz PyTorch Hubert model.    N)OptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutputCausalLMOutputSequenceClassifierOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )HubertConfigr   zfacebook/hubert-large-ls960-fti$  i   z['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'gGz6@zsuperb/hubert-base-superb-ksz'_unknown_'g(\!@zfacebook/hubert-base-ls960)shape	mask_probmask_lengthattention_mask	min_masksreturnc                    s  | \}dk rt dkr6t d d dtjd   fdd}|dk	rt|d	  nfd
dt|D }tj	|ft
d}g }	|}
|
dkr|S |D ]v}||}tjjt|d  |dd}t|dkrd }n|d }t|tj|
| tjd| g}|	| qt|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d kr҈d |	|	d k< t||	dd	 |S )af  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr2 }| d  |k rTt| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr   r   r   sequence_length k/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/hubert/modeling_hubert.pycompute_num_masked_spanj   s    
z6_compute_mask_indices.<locals>.compute_num_masked_spanNc                    s   g | ]} qS r#   r#   .0_)r"   r#   r$   
<listcomp>}   s     z)_compute_mask_indices.<locals>.<listcomp>dtyper   F)replace)
ValueErrornprandomranditemsumdetachtolistrangezerosboolchoicearangelenZconcatenateonesZint32appendarrayZbroadcast_toreshaper   Zput_along_axis)r   r   r   r   r   
batch_sizer%   input_lengthsZspec_aug_maskZspec_aug_mask_idxsZmax_num_masked_spanr   r   Zspec_aug_mask_idxZdummy_mask_idxoffsetsr#   r    r$   _compute_mask_indicesD   s`      

  rC   c                       s&   e Zd Zd fdd	Zdd Z  ZS )HubertNoLayerNormConvLayerr   c                    sj   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr   feat_extract_activation
activationselfconfiglayer_id	__class__r#   r$   rJ      s    
z#HubertNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S N)rR   rT   rV   hidden_statesr#   r#   r$   forward   s    

z"HubertNoLayerNormConvLayer.forward)r   __name__
__module____qualname__rJ   r^   __classcell__r#   r#   rY   r$   rD      s   rD   c                       s&   e Zd Zd fdd	Zdd Z  ZS )HubertLayerNormConvLayerr   c                    s|   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   rE   T)Zelementwise_affine)rI   rJ   rK   rL   rM   r   rN   rO   rP   rQ   rR   	LayerNorm
layer_normr   rS   rT   rU   rY   r#   r$   rJ      s    
z!HubertLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )Nr&   )rR   	transposerf   rT   r\   r#   r#   r$   r^      s    


z HubertLayerNormConvLayer.forward)r   r_   r#   r#   rY   r$   rd      s   rd   c                       s&   e Zd Zd fdd	Zdd Z  ZS )HubertGroupNormConvLayerr   c                    s   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   rE   T)Z
num_groupsZnum_channelsZaffine)rI   rJ   rK   rL   rM   r   rN   rO   rP   rQ   rR   r   rS   rT   	GroupNormrf   rU   rY   r#   r$   rJ      s    
z!HubertGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S r[   )rR   rf   rT   r\   r#   r#   r$   r^      s    


z HubertGroupNormConvLayer.forward)r   r_   r#   r#   rY   r$   ri      s   ri   c                       s$   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc              	      s   t    tj|j|j|j|jd |jd| _tjj	}t
tjjdrNtjjj	}t rdd l}|jj| jjdd || jddd| _W 5 Q R X |j| | jj |j| | jj n|| jddd| _t|j| _t|j | _d S )N   )rF   paddinggroupsweight_normr   Zmodifier_rankweight)namedim)rI   rJ   r   rN   hidden_sizenum_conv_pos_embeddingsZnum_conv_pos_embedding_groupsrR   utilsro   hasattrZparametrizationsr	   	deepspeedzeroGatheredParametersrq   Zregister_external_parameterweight_vweight_gHubertSamePadLayerrm   r   rS   rT   )rV   rW   ro   rx   rY   r#   r$   rJ     s(    

z&HubertPositionalConvEmbedding.__init__c                 C   s:   | dd}| |}| |}| |}| dd}|S Nr   rl   )rh   rR   rm   rT   r\   r#   r#   r$   r^   #  s    


z%HubertPositionalConvEmbedding.forwardr_   r#   r#   rY   r$   rk     s   rk   c                       s$   e Zd Z fddZdd Z  ZS )r}   c                    s$   t    |d dkrdnd| _d S )Nrl   r   r   )rI   rJ   num_pad_remove)rV   ru   rY   r#   r$   rJ   0  s    
zHubertSamePadLayer.__init__c                 C   s,   | j dkr(|d d d d d | j  f }|S )Nr   )r   r\   r#   r#   r$   r^   4  s    
zHubertSamePadLayer.forwardr_   r#   r#   rY   r$   r}   /  s   r}   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )HubertFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr@t ddg fddt jd D  }n6 jdkrd fddt jD }ntd	 j d
t|| _	d| _
d| _d S )Ngroupr   rX   c                    s   g | ]}t  |d  dqS )r   r   )rD   r(   irW   r#   r$   r*   B  s    z1HubertFeatureEncoder.__init__.<locals>.<listcomp>r   layerc                    s   g | ]}t  |d qS )r   )rd   r   r   r#   r$   r*   F  s     z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rI   rJ   Zfeat_extract_normri   r6   Znum_feat_extract_layersr.   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)rV   rW   r   rY   r   r$   rJ   >  s    



zHubertFeatureEncoder.__init__c                 C   s   |   D ]
}d|_qd| _d S )NF)
parametersrequires_gradr   rV   paramr#   r#   r$   _freeze_parametersO  s    z'HubertFeatureEncoder._freeze_parametersc                 C   sj   |d d d f }| j r"| jr"d|_| jD ]<}| j r\| jr\| jr\dd }tjj|||}q(||}q(|S )NTc                    s    fdd}|S )Nc                     s    |  S r[   r#   inputsmoduler#   r$   custom_forward_  s    zSHubertFeatureEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr#   r   r   r#   r   r$   create_custom_forward^  s    z;HubertFeatureEncoder.forward.<locals>.create_custom_forward)r   trainingr   r   r   torchrv   
checkpoint)rV   input_valuesr]   Z
conv_layerr   r#   r#   r$   r^   T  s    

zHubertFeatureEncoder.forward)r`   ra   rb   __doc__rJ   r   r^   rc   r#   r#   rY   r$   r   ;  s   r   c                       s   e Zd Z fddZ  ZS )HubertFeatureExtractorc                    s8   t  | td| jj d| jjd j dt d S )NzThe class `zD` has been depreciated and will be removed in Transformers v5. Use `r   z
` instead.)rI   rJ   warningswarnrZ   r`   	__bases__FutureWarningrV   rW   rY   r#   r$   rJ   o  s
    zHubertFeatureExtractor.__init__)r`   ra   rb   rJ   rc   r#   r#   rY   r$   r   n  s   r   c                       s$   e Zd Z fddZdd Z  ZS )HubertFeatureProjectionc                    sX   t    |j| _| jr0tj|jd |jd| _t|jd |j	| _
t|j| _d S )Nr&   Zeps)rI   rJ   feat_proj_layer_normr   re   rK   layer_norm_epsrf   Linearrt   
projectionDropoutZfeat_proj_dropoutdropoutr   rY   r#   r$   rJ   z  s    
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S r[   )r   rf   r   r   r\   r#   r#   r$   r^     s
    


zHubertFeatureProjection.forwardr_   r#   r#   rY   r$   r   y  s   r   c                       s   e Zd ZdZdeeeeed fddZej	eedd	d
Z
dej	eej	 eeej	  eej	 eej	 eeej	eej	 eeej	  f dddZ  ZS )HubertAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FT)	embed_dim	num_headsr   
is_decoderrH   c                    s   t    || _|| _|| _|| | _| j| | jkrNtd| j d| d| jd | _|| _t	j
|||d| _t	j
|||d| _t	j
|||d| _t	j
|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      )rH   )rI   rJ   r   r   r   head_dimr.   scalingr   r   r   k_projv_projq_projout_proj)rV   r   r   r   r   rH   rY   r#   r$   rJ     s    

zHubertAttention.__init__)tensorseq_lenbszc                 C   s    | ||| j| jdd S r~   )viewr   r   rh   
contiguous)rV   r   r   r   r#   r#   r$   _shape  s    zHubertAttention._shapeN)r]   key_value_statespast_key_valuer   layer_head_maskoutput_attentionsr   c                 C   sx  |dk	}|  \}}	}
| || j }|r\|dk	r\|d jd |jd kr\|d }|d }n|r| | |d|}| | |d|}n|dk	r| | |d|}| | |d|}tj|d |gdd}tj|d |gdd}n(| | |d|}| | |d|}| j	r ||f}|| j
 d| jf}| ||	|j| }|j| }|j| }| d}t||dd}|  || j
 |	|fkrtd|| j
 |	|f d|   |dk	r |  |d|	|fkrtd	|d|	|f d|   ||| j
|	|| }||| j
 |	|}tjj|dd}|dk	r|  | j
fkrhtd
| j
f d|   |dddd||| j
|	| }||| j
 |	|}|r||| j
|	|}||| j
 |	|}nd}tjj|| j| jd}t||}|  || j
 |	| jfkr4td|| j
 |	| jf d|   ||| j
|	| j}|dd}|||	| j}| |}|||fS )z#Input shape: Batch x Time x ChannelNr   rl   r   r&   rs   z$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size )pr   z `attn_output` should be of size )sizer   r   r   r   r   r   r   catr   r   r   r   r?   Zbmmrh   r.   r   
functionalsoftmaxr   r   r   r   )rV   r]   r   r   r   r   r   Zis_cross_attentionr   Ztgt_lenr)   Zquery_statesZ
key_statesZvalue_statesZ
proj_shapeZsrc_lenattn_weightsZattn_weights_reshapedZ
attn_probsZattn_outputr#   r#   r$   r^     s~    





" 
zHubertAttention.forward)r   FT)NNNNF)r`   ra   rb   r   r   floatr8   rJ   r   Tensorr   r   r   r^   rc   r#   r#   rY   r$   r     s4           r   c                       s$   e Zd Z fddZdd Z  ZS )HubertFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtrDt|j | _n|j| _t|j|j| _t|j| _d S r[   )rI   rJ   r   r   Zactivation_dropoutintermediate_dropoutr   rt   Zintermediate_sizeintermediate_dense
isinstanceZ
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   rY   r#   r$   rJ   (  s    
zHubertFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r[   )r   r   r   r   r   r\   r#   r#   r$   r^   5  s    




zHubertFeedForward.forwardr_   r#   r#   rY   r$   r   '  s   r   c                       s&   e Zd Z fddZdddZ  ZS )HubertEncoderLayerc                    sf   t    t|j|j|jdd| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _d S )NFr   r   r   r   r   )rI   rJ   r   rt   num_attention_headsattention_dropout	attentionr   r   r   r   re   r   rf   r   feed_forwardfinal_layer_normr   rY   r#   r$   rJ   A  s    

zHubertEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|rb||f7 }|S Nr   r   )r   r   rf   r   r   rV   r]   r   r   Zattn_residualr   r)   outputsr#   r#   r$   r^   N  s      



zHubertEncoderLayer.forward)NFr_   r#   r#   rY   r$   r   @  s   r   c                       s,   e Zd Z fddZejdddZ  ZS )HubertAttnAdapterLayerc                    sZ   t    |j| _|j| _t| j| _t	| j| j| _
t | _t	| j| j| _dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)rI   rJ   adapter_attn_dimZ	input_dimrt   Z
hidden_dimr   re   normr   linear_1ZReLUact_fnlinear_2r   rY   r#   r$   rJ   d  s    

zHubertAttnAdapterLayer.__init__)r]   c                 C   s,   |  |}| |}| |}| |}|S r[   )r   r   r   r   r\   r#   r#   r$   r^   r  s
    



zHubertAttnAdapterLayer.forward)r`   ra   rb   rJ   r   FloatTensorr^   rc   r#   r#   rY   r$   r   c  s   r   c                       s8   e Zd Z fddZdejeej edddZ  Z	S )	!HubertEncoderLayerStableLayerNormc                    s   t    t|j|j|jdd| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _t|dd d k	r~t|| _nd | _d S )NFr   r   r   )rI   rJ   r   rt   r   r   r   r   r   r   r   re   r   rf   r   r   r   getattrr   adapter_layerr   rY   r#   r$   rJ   ~  s    

z*HubertEncoderLayerStableLayerNorm.__init__NF)r]   r   r   c                 C   sz   |}|  |}| j|||d\}}}| |}|| }|| | | }| jd k	rb|| | }|f}|rv||f7 }|S r   )rf   r   r   r   r   r   r   r#   r#   r$   r^     s     
  


z)HubertEncoderLayerStableLayerNorm.forward)NF)
r`   ra   rb   rJ   r   r   r   r8   r^   rc   r#   r#   rY   r$   r   }  s     r   c                       s<   e Zd Z fddZd	ejeej eeedddZ	  Z
S )
HubertEncoderc                    sf   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr   c                    s   g | ]}t  qS r#   )r   r'   r   r#   r$   r*     s     z*HubertEncoder.__init__.<locals>.<listcomp>FrI   rJ   rW   rk   pos_conv_embedr   re   rt   r   rf   r   r   r   r   r6   num_hidden_layerslayersr   r   rY   r   r$   rJ     s    

 zHubertEncoder.__init__NFT)r]   r   r   output_hidden_statesreturn_dictc                    s  |rdnd } rdnd }|d k	r| ddd|jd }d|| < d|d d d d d d f j|jd }|t|jj }||jd d|jd |jd }| 	|}	||	 }| 
|}| |}t }
| jD ]}|r||f }tg }| jr|| jjk rdnd	}|r|
r`| jrJ| jrJ fd
d}tjj||||}n||| d}|d }|rjd} r||d f }q|r||f }|stdd |||fD S t|||dS )Nr#   r&   r   rl   r         ?r+   TFc                    s    fdd}|S )Nc                     s    | f S r[   r#   r   r   r   r#   r$   r     s    zLHubertEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr#   r   r   r   r$   r     s    z4HubertEncoder.forward.<locals>.create_custom_forwardr   NNc                 s   s   | ]}|d k	r|V  qd S r[   r#   r(   vr#   r#   r$   	<genexpr>  s      z(HubertEncoder.forward.<locals>.<genexpr>Zlast_hidden_stater]   
attentions)	unsqueezerepeatr   tor,   r   finfominexpandr   rf   r   r	   r   r1   r   rW   	layerdropr   rv   r   tupler
   rV   r]   r   r   r   r   Zall_hidden_statesZall_self_attentionsZexpand_attention_maskZposition_embeddingsZdeepspeed_zero3_is_enabledr   Zdropout_probabilityZskip_the_layerr   Zlayer_outputsr#   r   r$   r^     sd    
&   





  
zHubertEncoder.forward)NFFT)r`   ra   rb   rJ   r   r   r   r   r8   r^   rc   r#   r#   rY   r$   r     s       r   c                       s&   e Zd Z fddZdddZ  ZS )	HubertEncoderStableLayerNormc                    sf   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr   c                    s   g | ]}t  qS r#   )r   r'   r   r#   r$   r*     s     z9HubertEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   rY   r   r$   rJ     s    

z%HubertEncoderStableLayerNorm.__init__NFTc                    s  |rdnd } rdnd }|d k	r| ddd|jd }d|| < d|d d d d d d f j|jd }|t|jj }||jd d|jd |jd }| 	|}	||	 }| 
|}t }
| jD ]}|r||f }tg }| jr|| jjk rdnd	}|r|
rR| jr<| jr< fd
d}tjj||||}n||| d}|d }|r\d} r||d f }q| |}|r||f }|stdd |||fD S t|||dS )Nr#   r&   r   rl   r   r   r+   TFc                    s    fdd}|S )Nc                     s    | f S r[   r#   r   r   r#   r$   r   :  s    z[HubertEncoderStableLayerNorm.forward.<locals>.create_custom_forward.<locals>.custom_forwardr#   r   r   r   r$   r   9  s    zCHubertEncoderStableLayerNorm.forward.<locals>.create_custom_forwardr   r   c                 s   s   | ]}|d k	r|V  qd S r[   r#   r   r#   r#   r$   r   V  s      z7HubertEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )r   r   r   r   r,   r   r   r   r   r   r   r	   r   r1   r   rW   r   r   rv   r   rf   r   r
   r   r#   r   r$   r^     sd    
&   




  

z$HubertEncoderStableLayerNorm.forward)NFFTr_   r#   r#   rY   r$   r     s       r   c                   @   s\   e Zd ZdZeZdZdZdZdd Z	ddd	Z
eejef d
ddZeejdddZdS )HubertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    hubertr   Tc              	   C   s&  t |tjr$|jjjd| jjd nt |tjtj	frR|j
j  |jjd nt |tjrt rddl}t|drt|dr|jj|j|jgdd tj|jj W 5 Q R X q|jj|jdd tj|jj W 5 Q R X ntj|jj t |tjtjfr"|j
dk	r"|j
j  dS )	zInitialize the weightsr   )meanZstdr   r   Nr{   r|   rp   )r   r   r   rq   dataZnormal_rW   Zinitializer_rangere   rj   rH   Zzero_Zfill_rN   r	   rx   rw   ry   rz   r{   r|   initZkaiming_normal_)rV   r   rx   r#   r#   r$   _init_weightsi  s      z#HubertPreTrainedModel._init_weightsFc                 C   s   t |ttfr||_d S r[   )r   r   r   r   )rV   r   valuer#   r#   r$   _set_gradient_checkpointing  s    z1HubertPreTrainedModel._set_gradient_checkpointing)rA   c                 C   s4   dd }t | jj| jjD ]\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)Zrounding_moder   )r   div)r   rF   rG   r#   r#   r$   _conv_out_length  s    zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)ziprW   rO   rP   )rV   rA   r
  rF   rG   r#   r#   r$    _get_feat_extract_output_lengths  s    z6HubertPreTrainedModel._get_feat_extract_output_lengths)feature_vector_lengthr   c                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )Nr&   r   )r,   devicer   )r  )r  r3   r   r   longr   r7   r,   r  r:   flipZcumsumr8   )rV   r  r   Zoutput_lengthsr@   r#   r#   r$   "_get_feature_vector_attention_mask  s    
  "z8HubertPreTrainedModel._get_feature_vector_attention_maskN)F)r`   ra   rb   r   r   config_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingr  r  r   r   
LongTensorr   r  r  r#   r#   r#   r$   r   ^  s   
r   a!  
    Hubert was proposed in [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden
    Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia,
    Ruslan Salakhutdinov, Abdelrahman Mohamed.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            <Tip warning={true}>

            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
            True`. For all models whose processor has `config.return_attention_mask == False`, such as
            [hubert-base](https://huggingface.co/facebook/hubert-base-ls960), `attention_mask` should **not** be passed
            to avoid degraded performance when doing batched inference. For such models `input_values` should simply be
            padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly different
            results depending on whether `input_values` is padded or not.

            </Tip>

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z`The bare Hubert Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Zed fddZdejeej eej dddZ	e
eeeeddeej eej eej ee ee ee eeef d	d
dZ  ZS )HubertModelr   c                    sz   t  | || _t|| _t|| _|jdks:|jdkrRt	
t|j | _|jrdt|| _n
t|| _|   d S )Nr   )rI   rJ   rW   r   feature_extractorr   feature_projectionmask_time_probmask_feature_probr   	Parameterr   r   rt   Zuniform_masked_spec_embedZdo_stable_layer_normr   encoderr   	post_initr   rY   r#   r$   rJ     s    


zHubertModel.__init__N)r]   mask_time_indicesr   c                 C   s  t | jdds|S | \}}}|dk	r<| j|j||< nZ| jjdkr| jrt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        Zapply_spec_augmentTNr   )r   r   r   r   )r  r,   )r   r   r   r&   )r   rW   r   r  r   r,   r  r   rC   Zmask_time_lengthZmask_time_min_masksr   r   r  r8   r  Zmask_feature_lengthZmask_feature_min_masksr   )rV   r]   r  r   r@   r"   rt   Zmask_feature_indicesr#   r#   r$   _mask_hidden_states  s4    zHubertModel._mask_hidden_states)output_typer  )r   r   r  r   r   r   r   c           
      C   s   |dk	r|n| j j}|dk	r |n| j j}|dk	r4|n| j j}| |}|dd}|dk	rl| |jd |}| |}| j	||d}| j
|||||d}	|	d }|s|f|	dd  S t||	j|	jdS )aZ  

        Returns:

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   rl   )r  r   r   r   r   r   r   )rW   r   r   use_return_dictr  rh   r  r   r  r  r  r
   r]   r   )
rV   r   r   r  r   r   r   Zextract_featuresr]   Zencoder_outputsr#   r#   r$   r^      s2    &

zHubertModel.forward)NN)NNNNN)r`   ra   rb   r   rJ   r   r   r   r  r  r   HUBERT_INPUTS_DOCSTRINGr   r
   _CONFIG_FOR_DOCr   r8   r   r   r^   rc   r#   r#   rY   r$   r    s0     .
     
r  zdHubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).c                       s   e Zd Zdee d fddZdd Zdd Zd	d
 Zdd Z	e
eeeeeeeddeej eej ee ee ee eej eeef dddZ  ZS )HubertForCTCN)target_langc                    s~   t  | t|| _t|j| _|| _|j	d krFt
d| j dt|dr\|jr\|jn|j}t||j	| _|   d S )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)rI   rJ   r  r  r   r   Zfinal_dropoutr   r%  
vocab_sizer.   rZ   rw   r&  output_hidden_sizert   r   lm_headr  )rV   rW   r%  r(  rY   r#   r$   rJ   p  s    

zHubertForCTC.__init__c                 C   sr   | j }|dk	r2t| jdddkr2td| dn<|dkrXt| jdddk	rXtd n|dk	rn| j|dd dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr   zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)Z
force_load)r%  r   rW   r.   loggerinfoZload_adapter)rV   r%  r#   r#   r$   tie_weights  s    zHubertForCTC.tie_weightsc                 C   s   t dt |   dS )
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5.Please use the equivalent `freeze_feature_encoder` method instead.Nr   r   r   freeze_feature_encoderrV   r#   r#   r$   freeze_feature_extractor  s
    z%HubertForCTC.freeze_feature_extractorc                 C   s   | j j  dS r-  Nr  r  r   r1  r#   r#   r$   r0    s    z#HubertForCTC.freeze_feature_encoderc                 C   s   | j  D ]
}d|_q
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  r   r   r   r#   r#   r$   freeze_base_model  s    zHubertForCTC.freeze_base_model)r   r  r  expected_outputexpected_lossr   r   r   r   r   labelsr   c              
   C   sf  |dk	r|n| j j}| j|||||d}|d }| |}| |}	d}
|dk	r"| | j jkrttd| j j |dk	r|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|	dtjddd}tjjjd	d
, tjj||||| j j| j j| j jd}
W 5 Q R X |sR|	f|td  }|
dk	rN|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nr   r   z$Label values must be <= vocab_size: r+   r&   )rs   r,   r   F)enabled)blankZ	reductionZzero_infinitylosslogitsr]   r   )rW   r!  r  r   r)  r   r'  r.   r   Z	ones_liker  r  r3   r   Zmasked_selectr   r   Zlog_softmaxZfloat32rh   backendsZcudnnflagsZctc_lossZpad_token_idZctc_loss_reductionZctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r]   r   )rV   r   r   r   r   r   r;  r   r]   r@  r?  rA   Zlabels_maskZtarget_lengthsZflattened_targetsZ	log_probsoutputr#   r#   r$   r^     sR    





   zHubertForCTC.forward)N)NNNNN)r`   ra   rb   r   r   rJ   r,  r2  r0  r7  r   r"  r   _CHECKPOINT_FOR_DOCr   r#  _CTC_EXPECTED_OUTPUT_CTC_EXPECTED_LOSSr   r   r8   r   r   r^   rc   r#   r#   rY   r$   r$  j  s6   
     
r$  z
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       s   e Zd Z fddZdd Zdd Zdd Zeee	e
eed	eed
deej eej ee ee ee eej eeef dddZ  ZS )HubertForSequenceClassificationc                    s   t  | t|dr$|jr$tdt|| _|jd }|jrTt	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nr&  z]Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)r   )rI   rJ   rw   r&  r.   r  r  r   use_weighted_layer_sumr   r  r   r<   layer_weightsr   rt   Zclassifier_proj_size	projector
num_labels
classifierr  )rV   rW   Z
num_layersrY   r#   r$   rJ     s    

z(HubertForSequenceClassification.__init__c                 C   s   t dt |   dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r.  Nr/  r1  r#   r#   r$   r2  !  s
    z8HubertForSequenceClassification.freeze_feature_extractorc                 C   s   | j j  dS r3  r4  r1  r#   r#   r$   r0  -  s    z6HubertForSequenceClassification.freeze_feature_encoderc                 C   s   | j  D ]
}d|_q
dS r5  r6  r   r#   r#   r$   r7  4  s    z1HubertForSequenceClassification.freeze_base_modelZaudio)r   r  r  Zmodalityr8  r9  Nr:  c                 C   sf  |dk	r|n| j j}| j jr dn|}| j|||||d}| j jr|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}|dkr|jdd}
n<| |jd |}d|| < |jdd|jdddd }
| |
}d}|dk	r"t }||d| j j|d}|sR|f|td  }|dk	rN|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr   r   r   r&   r   r   r>  )rW   r!  rI  r  rC  r   stackr   r   r   rJ  r   r3   rK  r  r  r   rM  r   rL  r   r]   r   )rV   r   r   r   r   r   r;  r   r]   Znorm_weightsZpooled_outputZpadding_maskr@  r?  Zloss_fctrD  r#   r#   r$   r^   <  sF    

 

z'HubertForSequenceClassification.forward)NNNNN)r`   ra   rb   rJ   r2  r0  r7  r   r"  r   _SEQ_CLASS_CHECKPOINTr   r#  _SEQ_CLASS_EXPECTED_OUTPUT_SEQ_CLASS_EXPECTED_LOSSr   r   r   r8   r   r   r^   rc   r#   r#   rY   r$   rH    s6   	     
rH  )Nr   )Gr   r   typingr   r   r   numpyr/   r   Ztorch.utils.checkpointr   Ztorch.nnr   Zactivationsr   Zintegrations.deepspeedr	   Zmodeling_outputsr
   r   r   Zmodeling_utilsr   rv   r   r   r   r   r   Zconfiguration_hubertr   Z
get_loggerr`   r*  rC  r#  rE  Z_EXPECTED_OUTPUT_SHAPErF  rG  rO  rP  rQ  Z$HUBERT_PRETRAINED_MODEL_ARCHIVE_LISTr   r   r  ZndarrayrC   ModulerD   rd   ri   rk   r}   r   r   r   r   r   r   r   r   r   r   r   ZHUBERT_START_DOCSTRINGr"  r  r$  rH  r#   r#   r#   r$   <module>   s   

  
x(3 #.X[D&  