""" PyTorch BARK model."""
import math
from typing import Dict, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from ...generation.logits_process import AlternatingCodebooksLogitsProcessor, SuppressTokensLogitsProcessor
from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput
from ...modeling_utils import PreTrainedModel, get_parameter_device
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_accelerate_available,
    logging,
)
from ..auto import AutoModel
from .configuration_bark import (
    BarkCoarseConfig,
    BarkConfig,
    BarkFineConfig,
    BarkSemanticConfig,
    BarkSubModelConfig,
)
from .generation_configuration_bark import (
    BarkCoarseGenerationConfig,
    BarkFineGenerationConfig,
    BarkSemanticGenerationConfig,
)


logger = logging.get_logger(__name__)


_CHECKPOINT_FOR_DOC = "suno/bark-small"
_CONFIG_FOR_DOC = "BarkConfig"

BARK_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "suno/bark-small",
    "suno/bark",
]


class BarkSelfAttention(nn.Module):
    # BarkSelfAttention can have two attention types: full attention or causal attention

    def __init__(self, config, is_causal=False):
        super().__init__()

        # regularization
        self.dropout = config.dropout
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads

        if config.hidden_size % config.num_heads != 0:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        # key, query, value projections for all heads, but in a batch
        self.att_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias)
        # output projection
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.bias)

        self.is_causal = is_causal
        if is_causal:
            block_size = config.block_size
            # causal mask: lower-triangular matrix of ones, broadcastable over (batch, heads)
            bias = torch.tril(torch.ones((block_size, block_size), dtype=bool)).view(1, 1, block_size, block_size)
            self.register_buffer("bias", bias)

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        # re-assemble all head outputs side by side
        # (batch, num_heads, seq_len, attn_head_size) -> (batch, seq_len, num_heads * attn_head_size)
        tensor = tensor.transpose(1, 2).contiguous()
        tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))

        return tensor

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # scale the dot-product scores by the square root of the head dimension
        attn_weights = torch.matmul(query, key.transpose(-1, -2)) * (1.0 / math.sqrt(self.head_dim))

        if self.is_causal:
            query_length, key_length = query.size(-2), key.size(-2)

            # fill the masked (future) positions of the attention weights with the dtype minimum
            attn_weights = attn_weights.masked_fill(
                self.bias[:, :, key_length - query_length : key_length, :key_length] == 0,
                torch.finfo(attn_weights.dtype).min,
            )

        if attention_mask is not None:
            # apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # (batch, num_heads, seq_len, seq_len) x (batch, num_heads, seq_len, attn_head_size)
        # -> (batch, num_heads, seq_len, attn_head_size)
        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        past_key_values=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        # calculate query, key, value for all heads in batch and move head dim forward to be the batch dim
        query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if past_key_values is not None:
            past_key = past_key_values[0]
            past_value = past_key_values[1]
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class BarkLayerNorm(nn.Module):
    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False."""

    def __init__(self, hidden_size, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, eps=1e-5)


class BarkMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.in_proj = nn.Linear(config.hidden_size, 4 * config.hidden_size, bias=config.bias)
        self.out_proj = nn.Linear(4 * config.hidden_size, config.hidden_size, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)
        self.gelu = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.in_proj(hidden_states)
        hidden_states = self.gelu(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class BarkBlock(nn.Module):
    def __init__(self, config, is_causal=False):
        super().__init__()

        if is_causal:
            # if causal, uses the handmade LayerNorm, so that the LayerNorm bias is optional.
            # This sticks with Bark's choice of leaving an optional bias in the
            # autoregressive models (the "Text" and "Coarse" modules)
            self.layernorm_1 = BarkLayerNorm(config.hidden_size, bias=config.bias)
            self.layernorm_2 = BarkLayerNorm(config.hidden_size, bias=config.bias)
        else:
            self.layernorm_1 = nn.LayerNorm(config.hidden_size)
            self.layernorm_2 = nn.LayerNorm(config.hidden_size)

        self.attn = BarkSelfAttention(config, is_causal=is_causal)

        self.mlp = BarkMLP(config)

    def forward(
        self,
        hidden_states,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        intermediary_hidden_states = self.layernorm_1(hidden_states)

        attn_outputs = self.attn(
            intermediary_hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        attn_output = attn_outputs[0]  # output_attn: output, present_key_values, (attn_weights)
        outputs = attn_outputs[1:]

        intermediary_hidden_states = hidden_states + attn_output
        intermediary_hidden_states = intermediary_hidden_states + self.mlp(
            self.layernorm_2(intermediary_hidden_states)
        )

        if use_cache:
            outputs = (intermediary_hidden_states,) + outputs
        else:
            outputs = (intermediary_hidden_states,) + outputs[1:]

        return outputs  # hidden_states, (present), (attentions)


class BarkPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BarkConfig
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """

        # if the module has an `_hf_hook`, it has been offloaded, so the device has to be found in the hook
        if not hasattr(self, "_hf_hook"):
            return get_parameter_device(self)
        for module in self.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)

        return get_parameter_device(self)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, (BarkCausalModel, BarkFineModel, BarkModel)):
            module.gradient_checkpointing = value


BARK_MODEL_START_DOCSTRING = """
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`{config}`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BARK_START_DOCSTRING = """
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BarkConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BARK_FINE_INPUTS_DOCSTRING = r"""
    Args:
        codebook_idx (`int`):
            Index of the codebook that will be predicted.
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, number_of_codebooks)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it. Initially, indices of the first two codebooks are obtained from the `coarse` sub-model. The rest is
            predicted recursively by attending the previously predicted channels. The model predicts on windows of
            length 1024.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): NOT IMPLEMENTED YET.
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
            `past_key_values` is used, optionally only the last `input_embeds` have to be input (see
            `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
            associated vectors than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

BARK_CAUSAL_MODEL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `input_ids` of shape `(batch_size, sequence_length)`.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
            have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
            is used in priority instead of `input_ids`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


# GPT2-like autoregressive model
class BarkCausalModel(BarkPreTrainedModel):
    config_class = BarkSubModelConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # initialize as an autoregressive GPT-like model
        self.input_embeds_layer = nn.Embedding(config.input_vocab_size, config.hidden_size)
        self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([BarkBlock(config, is_causal=True) for _ in range(config.num_layers)])

        self.layernorm_final = BarkLayerNorm(config.hidden_size, bias=config.bias)

        self.lm_head = nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.input_embeds_layer

    def set_input_embeddings(self, new_embeddings):
        self.input_embeds_layer = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        input_embeds = kwargs.get("input_embeds", None)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if past_key_values is not None:
            # only the last token is needed for input_ids if a past is defined
            seq_len = input_ids.shape[1]
            input_ids = input_ids[:, [-1]]

            # input_embeds have already been used and are not required anymore
            input_embeds = None
        else:
            if input_embeds is not None and kwargs.get("use_cache"):
                seq_len = input_embeds.shape[1]
            else:
                seq_len = input_ids.shape[1]

        # ensure that attention_mask and position_ids shapes are aligned with the weird Bark hack of reducing
        # sequence length on the first forward pass
        if attention_mask is not None:
            attention_mask = attention_mask[:, :seq_len]
        if position_ids is not None:
            position_ids = position_ids[:, :seq_len]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None

        if input_embeds is not None and kwargs.get("use_cache"):
            return {
                "input_ids": None,
                "input_embeds": input_embeds,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
            }
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }

    @add_start_docstrings_to_model_forward(BARK_CAUSAL_MODEL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        input_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Verify if input_embeds already exists, then compute embeddings.
        if input_ids is not None and input_embeds is not None:
            raise ValueError("You cannot specify both input_ids and input_embeds at the same time")
        elif input_embeds is not None and past_key_values is None:
            # we want to return the input_embeds in priority so that it is in line with a weird hack
            # of Bark which concatenates two bits of the input_embeds on the first forward pass of the semantic model
            pass
        elif input_ids is not None:
            input_embeds = self.input_embeds_layer(input_ids)  # token embeddings of shape (b, t, n_embd)
        elif input_embeds is not None:
            pass
        else:
            raise ValueError("You have to specify either input_ids or input_embeds")

        input_shape = input_embeds.size()[:-1]
        batch_size = input_embeds.shape[0]
        seq_length = input_shape[-1]

        device = input_ids.device if input_ids is not None else input_embeds.device

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.layers))
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)  # shape (1, seq_length)

        position_embeds = self.position_embeds_layer(position_ids)  # position embeddings of shape (1, t, n_embd)

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length], broadcastable over heads and from_seq_length
            attention_mask = attention_mask[:, None, None, :]

            # 1.0 marks positions to attend, 0.0 masked positions; convert to additive mask with the
            # dtype's smallest value for masked positions (fp16 compatible)
            attention_mask = attention_mask.to(dtype=self.dtype)
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        hidden_states = self.drop(input_embeds + position_embeds)
        output_shape = input_shape + (hidden_states.size(-1),)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        present_key_values = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for i, (block, past_layer_key_values) in enumerate(zip(self.layers, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, use_cache, output_attentions)

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    None,
                    attention_mask,
                    head_mask[i],
                )
            else:
                outputs = block(
                    hidden_states,
                    past_key_values=past_layer_key_values,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]

            if use_cache:
                present_key_values = present_key_values + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.layernorm_final(hidden_states)

        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            raise NotImplementedError(
                "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
            )

        if not return_dict:
            return tuple(
                v for v in [None, logits, present_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=present_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )


@add_start_docstrings(
    """Bark semantic (or text) model. It shares the same architecture as the coarse model.
    It is a GPT-2 like autoregressive model with a language modeling head on top.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkSemanticConfig"),
)
class BarkSemanticModel(BarkCausalModel):
    base_model_prefix = "semantic"
    config_class = BarkSemanticConfig

    def generate(
        self,
        input_ids: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates text semantic tokens from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids, i.e. tokenized input sentences. Will be truncated up to
                semantic_generation_config.max_input_semantic_length tokens. Note that the output audios will be as
                long as the longest generation among the batch.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
            attention_mask (`Optional[torch.Tensor]`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        Returns:
            torch.LongTensor: Output semantic tokens.
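
        Example:

        ```python
        >>> # Illustrative sketch only: `BarkModel.generate` normally drives this method. The config
        >>> # plumbing below mirrors what `BarkModel.generate` does internally; treat it as an
        >>> # assumption rather than a supported public API.
        >>> from transformers import AutoProcessor, BarkModel
        >>> from transformers.models.bark.generation_configuration_bark import BarkSemanticGenerationConfig

        >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
        >>> model = BarkModel.from_pretrained("suno/bark-small")
        >>> inputs = processor("Hello world")

        >>> semantic_config = BarkSemanticGenerationConfig(**model.generation_config.semantic_config)
        >>> semantic_tokens = model.semantic.generate(
        ...     inputs["input_ids"],
        ...     semantic_generation_config=semantic_config,
        ...     attention_mask=inputs.get("attention_mask"),
        ... )
        ```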
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        batch_size = input_ids.shape[0]

        max_input_semantic_length = semantic_generation_config.max_input_semantic_length

        input_ids = input_ids + semantic_generation_config.text_encoding_offset

        if attention_mask is not None:
            input_ids = input_ids.masked_fill((1 - attention_mask).bool(), semantic_generation_config.text_pad_token)

        if history_prompt is not None:
            semantic_history = history_prompt["semantic_prompt"][-max_input_semantic_length:]
            semantic_history = nn.functional.pad(
                semantic_history,
                (0, max_input_semantic_length - len(semantic_history)),
                value=semantic_generation_config.semantic_pad_token,
                mode="constant",
            )
        else:
            semantic_history = torch.tensor(
                [semantic_generation_config.semantic_pad_token] * max_input_semantic_length, dtype=torch.int
            ).to(self.device)

        semantic_history = torch.repeat_interleave(semantic_history[None], batch_size, dim=0)

        infer_array = torch.tensor(
            [[semantic_generation_config.semantic_infer_token]] * batch_size, dtype=torch.int
        ).to(self.device)

        input_embeds = torch.cat(
            [
                self.input_embeds_layer(input_ids[:, :max_input_semantic_length])
                + self.input_embeds_layer(semantic_history[:, : max_input_semantic_length + 1]),
                self.input_embeds_layer(infer_array),
            ],
            dim=1,
        )

        tokens_to_suppress = list(
            range(semantic_generation_config.semantic_vocab_size, semantic_generation_config.semantic_pad_token)
        )
        tokens_to_suppress.extend(
            list(range(semantic_generation_config.semantic_pad_token + 1, self.config.output_vocab_size))
        )

        suppress_tokens_logits_processor = SuppressTokensLogitsProcessor(tokens_to_suppress)

        # pass input_ids in order to stay consistent with the transformers generate method even though it is
        # not used (except to get the input seq_len - that's why we keep the first 257 tokens)
        semantic_output = super().generate(
            torch.ones((batch_size, max_input_semantic_length + 1), dtype=torch.int).to(self.device),
            input_embeds=input_embeds,
            logits_processor=[suppress_tokens_logits_processor],
            generation_config=semantic_generation_config,
            **kwargs,
        )

        # take the generated semantic tokens
        semantic_output = semantic_output[:, max_input_semantic_length + 1 :]

        return semantic_output


@add_start_docstrings(
    """Bark coarse acoustics model.
    It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
    language modeling head on top.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkCoarseConfig"),
)
class BarkCoarseModel(BarkCausalModel):
    base_model_prefix = "coarse_acoustics"
    config_class = BarkCoarseConfig

    def preprocess_histories(
        self,
        max_coarse_history: int,
        semantic_to_coarse_ratio: int,
        batch_size: int,
        semantic_generation_config: BarkSemanticGenerationConfig,
        codebook_size: int,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
    ):
        """
        Preprocess the optional `Bark` speaker prompts before `self.generate`.

        Args:
            max_coarse_history (`int`):
                Maximum size of coarse tokens used.
            semantic_to_coarse_ratio (`int`):
                Ratio of semantic to coarse token frequency.
            batch_size (`int`):
                Batch size, i.e. the number of samples.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            codebook_size (`int`):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`):
                Optional `Bark` speaker prompt.
        Returns:
            `tuple(torch.FloatTensor)`:
            - **x_semantic_history** (`torch.FloatTensor`) -- Processed semantic speaker prompt.
            - **x_coarse_history** (`torch.FloatTensor`) -- Processed coarse speaker prompt.
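
        With Bark's default rates (semantic tokens at ~49.9 Hz, coarse tokens at 75 Hz per codebook, and 2 coarse
        codebooks), `semantic_to_coarse_ratio` comes out to roughly `75 / 49.9 * 2 ≈ 3`, i.e. about three coarse
        tokens per semantic token. (Values quoted from the suno/bark defaults; treat them as indicative.)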
        """
        if history_prompt is not None:
            x_semantic_history = torch.repeat_interleave(history_prompt["semantic_prompt"][None], batch_size, dim=0)
            # clone to avoid modifying history_prompt["coarse_prompt"]
            x_coarse_history = history_prompt["coarse_prompt"].clone()

            # offset x_coarse_history
            if codebook_size is not None:
                for n in range(1, x_coarse_history.shape[0]):
                    # offset the codes of the n-th codebook by n * codebook_size
                    x_coarse_history[n, :] += codebook_size * n

            # flatten x_coarse_history
            x_coarse_history = torch.transpose(x_coarse_history, 0, 1).view(-1)

            x_coarse_history = x_coarse_history + semantic_generation_config.semantic_vocab_size

            x_coarse_history = torch.repeat_interleave(x_coarse_history[None], batch_size, dim=0)
            # e.g. after semantic_vocab_size, 1024 tokens are dedicated to the first codebook, the next 1024 tokens
            # to the second codebook.

            max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
            # trim histories correctly
            n_semantic_hist_provided = min(
                [
                    max_semantic_history,
                    x_semantic_history.shape[1] - x_semantic_history.shape[1] % 2,
                    int(np.floor(x_coarse_history.shape[1] / semantic_to_coarse_ratio)),
                ]
            )

            n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))

            x_semantic_history = x_semantic_history[:, -n_semantic_hist_provided:].int()
            x_coarse_history = x_coarse_history[:, -n_coarse_hist_provided:].int()
            # bit of a hack for time alignment (sounds better) - from the Bark original implementation
            x_coarse_history = x_coarse_history[:, :-2]

        else:
            # shape: (batch_size, 0)
            x_semantic_history = torch.tensor([[]] * batch_size, dtype=torch.int).to(self.device)
            x_coarse_history = torch.tensor([[]] * batch_size, dtype=torch.int).to(self.device)

        return x_semantic_history, x_coarse_history

    def generate(
        self,
        semantic_output: torch.Tensor = None,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        coarse_generation_config: BarkCoarseGenerationConfig = None,
        codebook_size: int = 1024,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates coarse acoustics tokens from input text semantic tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            semantic_output (`torch.Tensor` of shape (batch_size, seq_len), *optional*):
                Input text semantic ids, i.e. the output of `BarkSemanticModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
        Returns:
            torch.LongTensor: Output coarse acoustics tokens.
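
        Example:

        ```python
        >>> # Illustrative sketch, continuing the example in `BarkSemanticModel.generate`; the config
        >>> # plumbing mirrors `BarkModel.generate` and is an assumption rather than a public API.
        >>> from transformers.models.bark.generation_configuration_bark import BarkCoarseGenerationConfig

        >>> coarse_config = BarkCoarseGenerationConfig(**model.generation_config.coarse_acoustics_config)
        >>> coarse_tokens = model.coarse_acoustics.generate(
        ...     semantic_tokens,
        ...     semantic_generation_config=semantic_config,
        ...     coarse_generation_config=coarse_config,
        ... )
        ```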
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        if coarse_generation_config is None:
            raise ValueError("`coarse_generation_config` has to be provided")

        max_coarse_input_length = coarse_generation_config.max_coarse_input_length
        max_coarse_history = coarse_generation_config.max_coarse_history
        sliding_window_len = coarse_generation_config.sliding_window_len

        # replace semantic_pad_token (eos_tok and pad_tok here) with coarse_semantic_pad_token, i.e. the pad_token
        # used in the next model
        semantic_output.masked_fill_(
            semantic_output == semantic_generation_config.semantic_pad_token,
            coarse_generation_config.coarse_semantic_pad_token,
        )

        semantic_to_coarse_ratio = (
            coarse_generation_config.coarse_rate_hz
            / semantic_generation_config.semantic_rate_hz
            * coarse_generation_config.n_coarse_codebooks
        )
        max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))

        # beware: depends on the seq_len of the longest sequence of the batch.
        # Also, the seq_len might be one token too long because of an added pad_token
        # as compared to the Bark original implementation.
        max_generated_len = np.floor(
            semantic_output.shape[1] * semantic_to_coarse_ratio / coarse_generation_config.n_coarse_codebooks
        )
        max_generated_len = int(round(max_generated_len * coarse_generation_config.n_coarse_codebooks))

        batch_size = semantic_output.shape[0]

        x_semantic_history, x_coarse = self.preprocess_histories(
            history_prompt=history_prompt,
            max_coarse_history=max_coarse_history,
            semantic_to_coarse_ratio=semantic_to_coarse_ratio,
            batch_size=batch_size,
            semantic_generation_config=semantic_generation_config,
            codebook_size=codebook_size,
        )
        base_semantic_idx = x_semantic_history.shape[1]

        semantic_output = torch.hstack([x_semantic_history, semantic_output])

        n_window_steps = int(np.ceil(max_generated_len / sliding_window_len))

        total_generated_len = 0

        len_coarse_history = x_coarse.shape[1]

        for _ in range(n_window_steps):
            semantic_idx = base_semantic_idx + int(round(total_generated_len / semantic_to_coarse_ratio))

            # pad from right side
            input_coarse = semantic_output[:, int(np.max([0, semantic_idx - max_semantic_history])) :]
            input_coarse = input_coarse[:, :max_coarse_input_length]
            input_coarse = F.pad(
                input_coarse,
                (0, max_coarse_input_length - input_coarse.shape[-1]),
                "constant",
                coarse_generation_config.coarse_semantic_pad_token,
            )

            input_coarse = torch.hstack(
                [
                    input_coarse,
                    torch.tensor([[coarse_generation_config.coarse_infer_token]] * batch_size).to(self.device),
                    x_coarse[:, -max_coarse_history:],
                ]
            )

            alternatingLogitsProcessor = AlternatingCodebooksLogitsProcessor(
                input_coarse.shape[1],
                semantic_generation_config.semantic_vocab_size,
                codebook_size,
            )

            output_coarse = super().generate(
                input_coarse,
                logits_processor=[alternatingLogitsProcessor],
                max_new_tokens=min(sliding_window_len, max_generated_len - total_generated_len),
                generation_config=coarse_generation_config,
                **kwargs,
            )

            input_coarse_len = input_coarse.shape[1]

            x_coarse = torch.hstack([x_coarse, output_coarse[:, input_coarse_len:]])
            total_generated_len = x_coarse.shape[1] - len_coarse_history

            del output_coarse

        coarse_output = x_coarse[:, len_coarse_history:]

        return coarse_output


@add_start_docstrings(
    """Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
    language modeling heads, one for each codebook.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkFineConfig"),
)
class BarkFineModel(BarkPreTrainedModel):
    base_model_prefix = "fine_acoustics"
    config_class = BarkFineConfig
    main_input_name = "codebook_idx"

    def __init__(self, config):
        # non-causal GPT-like model with one embedding layer and one lm_head for each codebook of Encodec
        super().__init__(config)
        self.config = config

        self.input_embeds_layers = nn.ModuleList(
            [nn.Embedding(config.input_vocab_size, config.hidden_size) for _ in range(config.n_codes_total)]
        )
        self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([BarkBlock(config, is_causal=False) for _ in range(config.num_layers)])

        self.layernorm_final = nn.LayerNorm(config.hidden_size)

        self.lm_heads = nn.ModuleList(
            [
                nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
                for _ in range(config.n_codes_given, config.n_codes_total)
            ]
        )
        self.gradient_checkpointing = False
        self.n_codes_total = config.n_codes_total

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # one embedding layer for each codebook
        return self.input_embeds_layers

    def set_input_embeddings(self, new_embeddings):
        # one embedding layer for each codebook
        self.input_embeds_layers = new_embeddings

    def get_output_embeddings(self):
        # one lm_head for each codebook
        return self.lm_heads

    def set_output_embeddings(self, new_output_embeddings):
        # one lm_head for each codebook
        self.lm_heads = new_output_embeddings

    def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
        old_embeddings_list = self.get_input_embeddings()
        new_embeddings_list = nn.ModuleList(
            [
                self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
                for old_embeddings in old_embeddings_list
            ]
        )
        self.set_input_embeddings(new_embeddings_list)
        new_num_tokens = new_embeddings_list[0].weight.shape[0]

        # if word embeddings are not tied, make sure that the lm heads are resized as well
        if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
            old_lm_head_list = self.get_output_embeddings()
            new_lm_head_list = nn.ModuleList(
                [self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list]
            )
            self.set_output_embeddings(new_lm_head_list)

        return self.get_input_embeddings()

    def resize_token_embeddings(
        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
    ) -> nn.Embedding:
        """
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
        """
        model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        if new_num_tokens is None and pad_to_multiple_of is None:
            return model_embeds

        # Update base model and current model config
        self.config.output_vocab_size = model_embeds[0].weight.shape[0]
        self.config.vocab_size = model_embeds[0].weight.shape[0]
        self.output_vocab_size = model_embeds[0].weight.shape[0]
        self.vocab_size = model_embeds[0].weight.shape[0]

        # Tie weights again if needed
        self.tie_weights()

        return model_embeds

    def tie_weights(self):
        """
        Tie the weights between the input embeddings list and the output embeddings list.

        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
        weights instead.
        """
        if getattr(self.config, "tie_word_embeddings", True):
            self._tied_weights_keys = []
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()

            for i in range(self.config.n_codes_total - self.config.n_codes_given):
                # self.input_embeds_layers[i + 1].weight <-> self.lm_heads[i].weight
                self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
                self._tied_weights_keys.append(f"lm_heads.{i}.weight")

        for module in self.modules():
            if hasattr(module, "_tie_weights"):
                module._tie_weights()

    @add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING)
    def forward(
        self,
        codebook_idx: int,  # an additional idx corresponding to the id of the codebook that will be predicted
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        input_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if codebook_idx == 0:
            raise ValueError("Cannot predict 0th codebook - 0th codebook should be predicted by the coarse model")

        if input_ids is not None and input_embeds is not None:
            raise ValueError("You cannot specify both input_ids and input_embeds at the same time")

        if input_ids is None and input_embeds is None:
            raise ValueError("You have to specify either input_ids or input_embeds")

        if input_ids is not None:
            # the input_embeddings are the sum of the j previous codebooks embeddings before
            # the current codebook_idx codebook

            # forward the GPT model itself
            input_embeds = [
                input_embeds_layer(input_ids[:, :, i]).unsqueeze(-1)
                for i, input_embeds_layer in enumerate(self.input_embeds_layers)
            ]  # token embeddings of shape (b, t, n_embd)
            input_embeds = torch.cat(input_embeds, dim=-1)
            input_embeds = input_embeds[:, :, :, : codebook_idx + 1].sum(dim=-1)

        input_shape = input_embeds.size()[:-1]
        batch_size = input_embeds.shape[0]
        seq_length = input_shape[1]

        device = input_ids.device if input_ids is not None else input_embeds.device

        if position_ids is None:
            position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)  # shape (1, seq_length)

        position_embeds = self.position_embeds_layer(position_ids)  # position embeddings of shape (1, t, n_embd)

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        hidden_states = self.drop(input_embeds + position_embeds)
        output_shape = input_shape + (hidden_states.size(-1),)

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for i, block in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                output_attentions=output_attentions,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        hidden_states = self.layernorm_final(hidden_states)
        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        logits = self.lm_heads[codebook_idx - self.config.n_codes_given](hidden_states)

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not implemented yet")

        if not return_dict:
            return tuple(v for v in [None, logits, all_hidden_states, all_self_attentions] if v is not None)

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def generate(
        self,
        coarse_output: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        coarse_generation_config: BarkCoarseGenerationConfig = None,
        fine_generation_config: BarkFineGenerationConfig = None,
        codebook_size: int = 1024,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates fine acoustics tokens from input coarse acoustics tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            coarse_output (`torch.Tensor` of shape (batch_size, seq_len)):
                Input coarse acoustics ids, i.e. the output of `BarkCoarseModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            fine_generation_config (`BarkFineGenerationConfig`):
                Generation config indicating how to generate the fine tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
        Returns:
            torch.LongTensor: Output fine acoustics tokens.
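
        Example:

        ```python
        >>> # Illustrative sketch, continuing the example in `BarkCoarseModel.generate`; `codebook_size`
        >>> # keeps its default of 1024. Treat the config plumbing as an assumption rather than a public API.
        >>> from transformers.models.bark.generation_configuration_bark import BarkFineGenerationConfig

        >>> fine_config = BarkFineGenerationConfig(**model.generation_config.fine_acoustics_config)
        >>> fine_tokens = model.fine_acoustics.generate(
        ...     coarse_tokens,
        ...     semantic_generation_config=semantic_config,
        ...     coarse_generation_config=coarse_config,
        ...     fine_generation_config=fine_config,
        ... )
        ```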
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        if coarse_generation_config is None:
            raise ValueError("`coarse_generation_config` has to be provided")

        if fine_generation_config is None:
            raise ValueError("`fine_generation_config` has to be provided")

        # since we don't really use GenerationConfig through the fine model (autoencoder),
        # and since only temperature is used from the classic GenerationConfig parameters,
        # manually impose the kwargs priority over the generation config
        temperature = kwargs.get("temperature", fine_generation_config.temperature)

        max_fine_history_length = fine_generation_config.max_fine_history_length
        max_fine_input_length = fine_generation_config.max_fine_input_length

        # shape: (batch, n_coarse_codebooks * seq_len) -> (batch, seq_len, n_coarse_codebooks)
        coarse_output = coarse_output.view(coarse_output.shape[0], -1, coarse_generation_config.n_coarse_codebooks)

        # brings ids into the range [0, codebook_size - 1]
        coarse_output = torch.remainder(coarse_output - semantic_generation_config.semantic_vocab_size, codebook_size)
        batch_size = coarse_output.shape[0]

        if history_prompt is not None:
            # transpose to get to shape (seq_len, n_fine_codebooks)
            x_fine_history = torch.repeat_interleave(history_prompt["fine_prompt"].T[None], batch_size, dim=0)
        else:
            x_fine_history = None

        n_coarse = coarse_generation_config.n_coarse_codebooks

        # pad the missing fine codebook channels with codebook_size
        fine_input = F.pad(
            coarse_output,
            (0, fine_generation_config.n_fine_codebooks - n_coarse),
            "constant",
            codebook_size,
        )

        # prepend history if available (at most max_fine_history_length)
        if x_fine_history is not None:
            fine_input = torch.cat([x_fine_history[:, -max_fine_history_length:, :], fine_input], dim=1)

            # length of the fine history that has been added to fine_input
            n_history = x_fine_history[:, -max_fine_history_length:, :].shape[1]
        else:
            n_history = 0

        n_remove_from_end = 0
        # need to pad if too short (since the model is non-causal)
        if fine_input.shape[1] < max_fine_input_length:
            n_remove_from_end = max_fine_input_length - fine_input.shape[1]
            fine_input = F.pad(fine_input, (0, 0, 0, n_remove_from_end), mode="constant", value=codebook_size)

        # slide the window of the model by max_fine_history_length when the sequence is longer than
        # max_fine_input_length
        n_loops = (fine_input.shape[1] - max_fine_input_length) / max_fine_history_length
        n_loops = int(np.ceil(n_loops))
        n_loops = max(0, n_loops) + 1

        for n_outer in range(n_loops):
            start_idx = min([n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_input_length])

            start_fill_idx = min(
                [n_history + n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_history_length]
            )
            rel_start_fill_idx = start_fill_idx - start_idx
            input_buffer = fine_input[:, start_idx : start_idx + max_fine_input_length, :]
            for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
                logits = self.forward(n_inner, input_buffer).logits
                if temperature is None or temperature == 1.0:
                    relevant_logits = logits[:, rel_start_fill_idx:, :codebook_size]
                    codebook_preds = torch.argmax(relevant_logits, -1)
                else:
                    relevant_logits = logits[:, :, :codebook_size] / temperature
                    # apply softmax
                    probs = F.softmax(relevant_logits, dim=-1)[:, rel_start_fill_idx:max_fine_input_length]
                    # reshape to 2D: (batch_size, seq_len, codebook_size) -> (batch_size * seq_len, codebook_size)
                    probs = probs.reshape((-1, codebook_size))
                    # multinomial then reshape: (batch_size * seq_len) -> (batch_size, seq_len)
                    codebook_preds = torch.multinomial(probs, num_samples=1).view(batch_size, -1)
                codebook_preds = codebook_preds.to(torch.int32)
                input_buffer[:, rel_start_fill_idx:, n_inner] = codebook_preds
                del relevant_logits, codebook_preds

            # transfer into fine_input
            for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
                fine_input[
                    :, start_fill_idx : start_fill_idx + (max_fine_input_length - rel_start_fill_idx), n_inner
                ] = input_buffer[:, rel_start_fill_idx:, n_inner]
            del input_buffer

        fine_input = fine_input.transpose(1, 2)[:, :, n_history:]
        if n_remove_from_end > 0:
            fine_input = fine_input[:, :, :-n_remove_from_end]

        if fine_input.shape[-1] != coarse_output.shape[-2]:
            raise ValueError("input and output should have the same seq_len")

        return fine_input


@add_start_docstrings(
    """
    The full Bark model, a text-to-speech model composed of 4 sub-models:
    - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
      takes as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
    - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
    that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary
    to `encodec`.
    - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
    predicts the last codebooks based on the sum of the previous codebooks embeddings.
    - having predicted all the codebook channels from the [`EncodecModel`], Bark uses it to decode the output audio
      array.

    It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
    output sound according to specific predefined voice.
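
    In practice, the three stages and the codec are chained by [`BarkModel.generate`]; see its docstring for an
    end-to-end example.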
    """,
    BARK_START_DOCSTRING,
)
class BarkModel(BarkPreTrainedModel):
    config_class = BarkConfig

    def __init__(self, config):
        super().__init__(config)

        self.semantic = BarkSemanticModel(config.semantic_config)
        self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
        self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)

        self.codec_model = AutoModel.from_config(config.codec_config)

        self.config = config

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """
        # for bark_model, device must be verified on its sub-models.
        # if the semantic model has an `_hf_hook`, it has been offloaded, so the device is found in the hook
        if not hasattr(self.semantic, "_hf_hook"):
            return get_parameter_device(self)
        for module in self.semantic.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)

    def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
        """
        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
        method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
        the next sub-model runs.

        Args:
            gpu_id (`int`, *optional*, defaults to 0):
                GPU id on which the sub-models will be loaded and offloaded.
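
        Example:

        ```python
        >>> # Requires the `accelerate` library; a sketch of typical usage.
        >>> model = BarkModel.from_pretrained("suno/bark-small")
        >>> model.enable_cpu_offload()
        >>> # subsequent `model.generate(...)` calls move each sub-model to the GPU only while it runs
        ```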
        """
        if is_accelerate_available():
            from accelerate import cpu_offload_with_hook
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")

        device = torch.device(f"cuda:{gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu")
            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

        # this layer is used outside the first forward pass of semantic, so it needs to be loaded before semantic
        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)

        hook = None
        for cpu_offloaded_model in [
            self.semantic,
            self.coarse_acoustics,
            self.fine_acoustics,
        ]:
            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

        self.fine_acoustics_hook = hook

        _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)

        # We'll offload the last model manually.
        self.codec_model_hook = hook

    def codec_decode(self, fine_output):
        """Turn quantized audio codes into audio array using encodec."""

        fine_output = fine_output.transpose(0, 1)
        emb = self.codec_model.quantizer.decode(fine_output)
        out = self.codec_model.decoder(emb)
        audio_arr = out.squeeze(1)  # squeeze the codebook dimension

        return audio_arr

    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates audio from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
                longest generation among the batch.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
            kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:

                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
                - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the
                semantic, coarse and fine respectively. It has the priority over the keywords without a prefix.

                This means you can, for example, specify a generation strategy for all sub-models except one.
        Returns:
            torch.LongTensor: Output generated audio.

        Example:

        ```python
        >>> from transformers import AutoProcessor, BarkModel

        >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
        >>> model = BarkModel.from_pretrained("suno/bark-small")

        >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
        >>> voice_preset = "v2/en_speaker_6"

        >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)

        >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
        >>> audio_array = audio_array.cpu().numpy().squeeze()
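
        >>> # Optionally write the waveform to disk (assumes `scipy` is installed; the sample rate comes
        >>> # from the model's generation config):
        >>> from scipy.io import wavfile
        >>> wavfile.write("bark_generation.wav", model.generation_config.sample_rate, audio_array)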
        ```
        """
        # build the three sub-model generation configs from the nested Bark generation config
        semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
        coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
        fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)

        kwargs_semantic = {
            # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
            "attention_mask": kwargs.pop("attention_mask", None)
        }
        kwargs_coarse = {}
        kwargs_fine = {}
        for key, value in kwargs.items():
            if key.startswith("semantic_"):
                key = key[len("semantic_") :]
                kwargs_semantic[key] = value
            elif key.startswith("coarse_"):
                key = key[len("coarse_") :]
                kwargs_coarse[key] = value
            elif key.startswith("fine_"):
                key = key[len("fine_") :]
                kwargs_fine[key] = value
            else:
                # if the key is already in a specific config, then it's been set with a
                # submodule-specific value and we don't override it
                if key not in kwargs_semantic:
                    kwargs_semantic[key] = value
                if key not in kwargs_coarse:
                    kwargs_coarse[key] = value
                if key not in kwargs_fine:
                    kwargs_fine[key] = value

        # 1. Generate from the semantic model
        semantic_output = self.semantic.generate(
            input_ids,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            **kwargs_semantic,
        )

        # 2. Generate from the coarse model
        coarse_output = self.coarse_acoustics.generate(
            semantic_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            codebook_size=self.generation_config.codebook_size,
            **kwargs_coarse,
        )

        # 3. "Generate" from the fine model
        output = self.fine_acoustics.generate(
            coarse_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            fine_generation_config=fine_generation_config,
            codebook_size=self.generation_config.codebook_size,
            **kwargs_fine,
        )

        if getattr(self, "fine_acoustics_hook", None) is not None:
            # manually offload fine_acoustics to CPU and load codec_model to GPU,
            # since bark doesn't use codec_model's forward pass
            self.fine_acoustics_hook.offload()
            self.codec_model = self.codec_model.to(self.device)

        # 4. Decode the output and generate audio array
        audio = self.codec_decode(output)

        if getattr(self, "codec_model_hook", None) is not None:
            # offload codec_model to CPU
            self.codec_model_hook.offload()

        return audio