U
    ,È-eeÓ ã                   @   s²  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
m  mZ ddl	mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ e e ¡Z!ddgZ"dde#dƒ fdd„Z$dd„ Z%dd„ Z&dd„ Z'dd„ Z(dd„ Z)G dd„ dej*ƒZ+G dd „ d ej*ƒZ,G d!d"„ d"ej*ƒZ-G d#d$„ d$ej*ƒZ.G d%d&„ d&ej*ƒZ/G d'd(„ d(ej*ƒZ0G d)d*„ d*ej*ƒZ1G d+d,„ d,ej*ƒZ2G d-d.„ d.ej*ƒZ3d/Z4ed0e4ƒG d1d2„ d2eƒƒZ5G d3d4„ d4ej*ƒZ6G d5d6„ d6eƒZ7G d7d8„ d8ej*ƒZ8G d9d:„ d:ej*ƒZ9G d;d<„ d<ej*ƒZ:G d=d>„ d>ej*ƒZ;G d?d@„ d@ej*ƒZ<G dAdB„ dBej*ƒZ=G dCdD„ dDej*ƒZ>G dEdF„ dFej*ƒZ?G dGdH„ dHeƒZ@G dIdJ„ dJeƒZAdKZBedLe4ƒG dMdN„ dNeAƒƒZCdS )OzPyTorch Jukebox model.é    N)ÚListÚOptionalÚTuple)Únn)Ú	LayerNormé   )ÚACT2FN)ÚPreTrainedModel)Úadd_start_docstringsÚlogging)Útqdmé   )ÚATTENTION_PATTERNSÚJukeboxConfigÚJukeboxPriorConfigÚJukeboxVQVAEConfigzopenai/jukebox-1b-lyricszopenai/jukebox-5b-lyricsç        ZInfc           	      C   sØ   |   ¡ } t||  d¡ƒ}|dkrL| tj| |ddd ddd…f k }|| |< |dkrÔtj| ddd\}}tjtj|dddd}||k}|ddd…f   ¡ |dd	d…f< d|d
< tj	| tj
djd||d}|| |< | S )aì  
    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering

    Args:
        logits (`torch.Tensor`):
            logits distribution shape (vocabulary size)
        top_k (`int`, *optional*, defaults to 0):
            When `top_k >0` keep only top key tokens with highest probability (top-k filtering).
        top_p (`int`, *optional*, defaults to 0):
            When `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
    éÿÿÿÿr   ©Údim.Nr   T)Z
descendingr   r   ).r   ©Údtype)r   ÚindexÚsrc)ÚcloneÚminÚsizeÚtorchZtopkÚsortZcumsumÚFÚsoftmaxZ
zeros_likeÚboolÚscatter_)	ÚlogitsÚtop_kÚtop_pZfilter_valueZindices_to_removeZsorted_logitsZsorted_indicesZcumulative_probsZsorted_indices_to_remove© r&   úm/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/jukebox/modeling_jukebox.pyÚfilter_logits+   s$    $   ÿr(   c                 C   sê   | d } t | ƒ|k rdt tj|t | ƒ tjd | j¡| g¡}dg|t | ƒ  ttdt | ƒƒƒ }nvt	t | ƒ||d   | ƒ}t
t||d ƒt | ƒ|d  ƒ}| ||d  ||d  … }tt||d  ||d  ƒƒ}|jdd|fS )a  
    Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
    returned. If the provided token sequence is smaller, it will be padded, otherwise, only characters ranging from the
    midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
    the most relevant tokens (in time) for the sequence.

    Args:
        full_tokens (`List[int]`):
            List containing the token ids of the entire lyrics.
        total_length (`int`):
            Total expected length of the music (not all of it is generated, see duration), in samples.
        offset (`int`):
            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
            account
        duration (`int`):
            Expected duration of the generated music, in samples. The duration has to be smaller than the total length,
            which represent the overall length of the signal,
    r   r   r   ç       @é   r   )Úlenr   ÚcatÚzerosÚlongÚtoÚdeviceÚlistÚrangeÚintr   ÚmaxÚ	unsqueeze)Úfull_tokensZmax_n_lyric_tokensÚtotal_lengthÚoffsetÚdurationÚtokensÚindicesZmidpointr&   r&   r'   Úget_relevant_lyric_tokensQ   s    "ÿ& r<   c                 C   s@   g }t d| | | |ƒD ]"}|| | kr0| | }| |¡ q|S ©Nr   )r2   Úappend)r7   Ún_ctxÚ
hop_lengthZstartsÚstartr&   r&   r'   Ú
get_startss   s    rB   c           !      C   sB  |j d }|j}| | }|jd |jd  }}||k rr|| }	tj|tj||| |j|jdgdd}|jd }nd}	t|j	| d  |j ƒ}
|j
d |jd  }}|h}i }i }tt|||
ƒddD ]Ð}|| }|j|||jddd\}}tj||dd}tj||dd}g }t||ƒD ]H\}}|j|d d …||…f g ||d	}| |d d d …|f ¡ ~qtj|dd}~| ¡  ¡  ¡ }~|||< |||< qÈg }t|ƒD ]–}|dd
d …f }t |t|ƒd f¡}tt|||
ƒƒD ]6}|| }|| | }|| | } ||||…| f< qà|d ||	 …d d…f }| |¡ q¦|S )Nr   r   ©r   r0   r   z#Computing lyric to music alignment )ÚdescT)Úget_indicesr8   )Úget_attn_weightsr   r   )Úlevelsr?   Úshaper   r,   r-   r   r0   r3   Úhop_fractionZprior_alignment_headZprior_alignment_layerr   rB   Úget_metadataÚsample_lengthÚchunkÚzipÚforward_tokensr>   ÚfloatÚcpuÚnumpyr2   Únpr+   Úreversed)!Úmusic_tokensÚlabelsÚpriorÚconfigÚlevelr?   r:   Ú
batch_sizer7   Zpadding_lengthr@   Zalignment_headZalignment_layerZattn_layersZalignment_hopsZindices_hopsrA   ÚendÚmetadataZindices_hopZ	tokens_bsZmetadata_bsZw_hopsÚtokens_iÚ
metadata_iZw_hopÚweightsZalignment_hopÚ
alignmentsÚitemr6   Z	alignmentr;   r&   r&   r'   Úget_alignment}   sZ    
 ÿ"
ra   c           	      C   s°   t  |dd¡ ¡  ¡ }tt|jd ƒƒD ]‚}|d k	rŠt|ƒ|  ¡ \}}}| › d|› d|› d|› d|d d… › d|› }t 	||| ¡ q(t 	| › d|› d|› || ¡ q(d S )Nr   r   r   z/lvl_ú-é   z-sample-)
r   ÚclamprP   rQ   r1   r2   rH   ÚvaluesrR   Úsave)	ÚfnameZlvlÚmetasÚaudÚiZartistsZgenresZlyricsÚpathr&   r&   r'   Úsave_temp_audio¶   s    .rl   c           	      C   sü   | d ks|dkrd S |r || nt || dƒ}| dkrNtj|||d |¡} nž| dkrÎtj|||d ¡ } tj|||d ¡ } |  |||| ¡d d …d d…| | d …f } tjjj| ddd ¡  ||¡} n| d	krìtj|||d |¡} |  dd||¡S )
Nr   r   Úautoregressive©r0   Úsummaryr   ©r   r   r   r   )ÚvalueÚprime)	r4   r   ÚonesZtrilÚviewr   Ú
functionalÚpadÚ
contiguous)	ÚmaskÚquery_lengthZkey_value_lengthÚblocksÚspreadr0   ÚsampleÚsample_tr8   r&   r&   r'   Úget_maskÁ   s*    .ý
 úÿ	r~   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚJukeboxConv1Dc                    sH   t ƒ  ¡  || _|| _t ||¡}t |¡}t |¡| _	t |¡| _
d S ©N)ÚsuperÚ__init__Úinput_widthÚoutput_widthr   Úemptyr-   r   Ú	ParameterÚweightÚbias)Úselfrƒ   r„   r‡   rˆ   ©Ú	__class__r&   r'   r‚   Ý   s    

zJukeboxConv1D.__init__c              	   C   sR   |  ¡ d d… | jf˜}t | j |¡| d|  d¡¡| j |¡¡}|j|Ž }|S )Nr   )r   r„   r   Zaddmmrˆ   Útype_asrt   r‡   )r‰   Úhidden_statesZsize_outr&   r&   r'   Úforwardæ   s    

ý
zJukeboxConv1D.forward©Ú__name__Ú
__module__Ú__qualname__r‚   rŽ   Ú__classcell__r&   r&   rŠ   r'   r   Ü   s   	r   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚJukeboxResConv1DBlockr   ç      ð?c                    s`   t ƒ  ¡  |j| }|j| }|}|| _t ¡ | _t ||dd||¡| _	t ||ddd¡| _
d S )Nr   r   r   )r   r‚   Zres_convolution_multiplierZres_dilation_growth_rateÚ	res_scaler   ZReLUÚ
activationÚConv1dÚconv1d_1Úconv1d_2)r‰   rW   Ú
conv_widthÚdepthr–   Ú
hidden_dimZdilationÚpaddingrŠ   r&   r'   r‚   ò   s    



zJukeboxResConv1DBlock.__init__c                 C   s:   |}|   |¡}|  |¡}|   |¡}|  |¡}|| j|  S r€   )r—   r™   rš   r–   )r‰   r   Ú	residualsr&   r&   r'   rŽ   ý   s    



zJukeboxResConv1DBlock.forward)r   r•   r   r&   r&   rŠ   r'   r”   ñ   s   r”   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚJukeboxResnet1DFc           	         sŠ   t ƒ  ¡  |j| _|jsdndt |¡ }g }t|ƒD ]0}| jd krH|n|| j }| t	||||ƒ¡ q6|rz|d d d… }t
 |¡| _d S )Nr•   r   )r   r‚   Zres_dilation_cycleZdilation_cycleZconv_res_scaleÚmathÚsqrtr2   r>   r”   r   Ú
ModuleListÚresnet_block)	r‰   rW   r›   Zn_depthÚreverse_dilationr–   rz   rœ   Zblock_depthrŠ   r&   r'   r‚     s    
zJukeboxResnet1D.__init__c                 C   s   | j D ]}||ƒ}q|S r€   )r¤   ©r‰   r   Úblockr&   r&   r'   rŽ     s    

zJukeboxResnet1D.forward)Fr   r&   r&   rŠ   r'   r      s   r    c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚJukeboxEncoderConvBlockc              
      s   t ƒ  ¡  g }|d }|d }	|dkrjt|ƒD ]:}
| t |
dkrF|n|||||	¡¡ | t|||ƒ¡ q.t ||jddd¡| _t 	|¡| _
d S )Nr*   r   r   r   )r   r‚   r2   r>   r   r˜   r    Ú	embed_dimÚproj_outr£   Údownsample_block)r‰   rW   r©   r   rœ   Údown_tÚstride_trz   Úfilter_tÚpad_trj   rŠ   r&   r'   r‚     s    
$z JukeboxEncoderConvBlock.__init__c                 C   s"   | j D ]}||ƒ}q|  |¡}|S r€   )r«   rª   r¦   r&   r&   r'   rŽ   (  s    


zJukeboxEncoderConvBlock.forwardr   r&   r&   rŠ   r'   r¨     s   r¨   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚJukeboxEncoderc              
      sn   t ƒ  ¡  || _t ¡ | _ttt| jƒƒ||ƒ}|D ]4\}}	}
| j 	t
||dkrV|jn|j|||	|
ƒ¡ q4d S r=   )r   r‚   rG   r   r£   Úlevel_blocksrM   r1   r2   r>   r¨   Úconv_input_shaper©   )r‰   rW   Úwidthrœ   rG   Údowns_tÚ	strides_tÚiteratorrj   r¬   r­   rŠ   r&   r'   r‚   0  s    

     ÿÿzJukeboxEncoder.__init__c                 C   s4   g }t | jƒD ] }| j| }||ƒ}| |¡ q|S r€   )r2   rG   r±   r>   )r‰   r   Zall_hidden_statesrX   Úlevel_blockr&   r&   r'   rŽ   =  s    
zJukeboxEncoder.forwardr   r&   r&   rŠ   r'   r°   /  s   r°   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚJukeboxDecoderConvBockTc              
      s    || _ || _tƒ  ¡  g }|dkr|d }	|d }
t ||ddd¡| _t|ƒD ]@}| t	||||ƒ¡ | t 
|||d k r€|n||	||
¡¡ qNt |¡| _d S )Nr   r*   r   r   )r©   r   r   r‚   r   r˜   Úproj_inr2   r>   r    ZConvTranspose1dr£   Úupsample_block)r‰   rW   r©   r   rœ   r¬   r­   r¥   rz   r®   r¯   rj   rŠ   r&   r'   r‚   J  s(    
    ÿÿzJukeboxDecoderConvBock.__init__c                 C   s"   |   |¡}| jD ]}||ƒ}q|S r€   )r¹   rº   r¦   r&   r&   r'   rŽ   \  s    


zJukeboxDecoderConvBock.forward)Tr   r&   r&   rŠ   r'   r¸   I  s   r¸   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚJukeboxDecoderc           
   
      st   t ƒ  ¡  || _t ¡ | _ttt| jƒƒ||ƒD ]&\}}}	| j 	t
||j||||	ƒ¡ q0t |j|jddd¡| _d S )Nr   r   )r   r‚   rG   r   r£   r±   rM   r1   r2   r>   r¸   r©   r˜   r²   Úout)
r‰   rW   r   rœ   rG   r´   rµ   rX   r¬   r­   rŠ   r&   r'   r‚   d  s    

 ÿzJukeboxDecoder.__init__Tc                 C   sX   |d }t t| jƒƒD ]2}| j| }||ƒ}|dkr|r|||d   }q|  |¡}|S )Nr   r   r   )rS   r2   rG   r±   r¼   )r‰   r   Ú
all_levelsZhidden_staterX   r·   r&   r&   r'   rŽ   o  s    

zJukeboxDecoder.forward)Tr   r&   r&   rŠ   r'   r»   c  s   r»   c                       st   e Zd Zedœ‡ fdd„Zdd„ Zdd„ Zdd	„ Zd
d„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zddd„Z‡  ZS )ÚJukeboxBottleneckBlock©rW   c                    sV   t ƒ  ¡  |j| _|j| _|j| _d| _d| _d | _	d | _
|  dt | j| j¡¡ d S )Nr•   FÚcodebook)r   r‚   Únb_discrete_codesr©   Úcodebook_widthZlmuÚmuÚ	thresholdÚinitÚcodebook_sumÚcodebook_elemZregister_bufferr   r-   )r‰   rW   rŠ   r&   r'   r‚     s    
zJukeboxBottleneckBlock.__init__c                 C   sV   |j \}}|| jk rR| j| d | }dt |¡ }| |d¡}|t |¡|  }|S )Nr   ç{®Gáz„?)rH   rÁ   rR   r¢   Úrepeatr   Z
randn_like)r‰   r   r   Zembed_widthZ	n_repeatsÚstdr&   r&   r'   Ú_tileŠ  s    

zJukeboxBottleneckBlock._tilec                 C   sT   | j }d| _|  |¡}|t |jd ¡ d |… | _| j| _tj|| jj	d| _
d S )NTr   rn   )rÁ   rÅ   rË   r   ÚrandpermrH   rÀ   rÆ   rs   r0   rÇ   )r‰   r   rÁ   Úcodesr&   r&   r'   Úinit_codebook“  s    
z$JukeboxBottleneckBlock.init_codebookc              	   C   sŽ  | j | j| j  }}}t ¡ Z tj||jd |jd}| d| 	d|jd ¡d¡ t 
||¡}|jdd}|  |¡}	|	t |	jd ¡ d |… }
| j}|| j d| |  | _|| j d| |  | _| j 	|d¡| jk ¡ }| j 	||¡| j 	|d¡ }|| d| |
  | _|t |¡ }t |t |d ¡ ¡ }|| jk ¡ }t |¡}t | j| ¡t t |j¡¡ }W 5 Q R X ||||dœS )	Nr   rn   r   r   r   r•   g:Œ0âŽyE>)ÚentropyÚ	used_currÚusageÚdk)rÃ   rÂ   rÁ   r   Úno_gradr-   rH   r0   r"   rt   ÚmatmulÚsumrË   rÌ   rÀ   rÆ   rÇ   rÄ   rO   ÚlogÚnormrR   r¢   Úprod)r‰   r   Úlatent_statesrÃ   rÂ   rÁ   Zlatent_states_onehotZ_codebook_sumZ_codebook_elemrÍ   Z_random_codebookZold_codebookrÑ   Z	norm_codeZ_codebook_probrÏ   rÐ   rÒ   r&   r&   r'   Úupdate_codebook›  s.    
 ÿ
,z&JukeboxBottleneckBlock.update_codebookc                 C   sò   |  ddd¡ ¡ }| d|jd ¡}|jd | jkr\t |t |¡ ¡t 	t 
|j¡¡ }nŽ|jd d| j krê|dd | j…f |d| jd …f  }}t |t |¡ ¡t 	t 
|j¡¡ t |t |¡ ¡t 	t 
|j¡¡  }|| }||fS )Nr   r*   r   r   .)Úpermuterw   rt   rH   rÂ   r   r×   ÚmeanrR   r¢   rØ   )r‰   r   ÚprenormÚx1Zx2r&   r&   r'   Ú
preprocess¹  s    (&$$ÿz!JukeboxBottleneckBlock.preprocessc                 C   s8   |\}}|  ||d¡ ddd¡ ¡ }|  ||¡}||fS )Nr   r   r*   r   )rt   rÛ   rw   )r‰   rÙ   Údequantised_statesZx_shaperY   Útimer&   r&   r'   ÚpostprocessÊ  s    z"JukeboxBottleneckBlock.postprocessc                 C   sf   | j  ¡ }tj|d ddddt ||¡  tj|d ddd }tj|dd\}}t |¡}||fS )Nr*   r   T©r   Zkeepdimr   r   )rÀ   Útr   rÕ   rÔ   r   rÜ   )r‰   rÙ   Zcodebook_weightsZdistanceZmin_distancerT   Úfitr&   r&   r'   ÚquantiseÐ  s    
ÿþÿ
zJukeboxBottleneckBlock.quantisec                 C   s   t  || j¡}|S r€   )r   Z	embeddingrÀ   )r‰   rT   rà   r&   r&   r'   Ú
dequantiseÜ  s    z!JukeboxBottleneckBlock.dequantisec                 C   s8   |j \}}}|  |¡\}}|  |¡\}}| ||¡}|S r€   )rH   rß   ræ   rt   )r‰   rÙ   ÚsamplesÚ_Úseq_lenrT   r&   r&   r'   Úencodeà  s
    zJukeboxBottleneckBlock.encodec                 C   s6   |j \}}|  |¡}| ||| j¡ ddd¡ ¡ }|S ©Nr   r*   r   )rH   rç   rt   rÂ   rÛ   rw   )r‰   rT   rè   rê   rà   r&   r&   r'   Údecodeí  s
    

ÿzJukeboxBottleneckBlock.decodeTc                 C   sÀ   |j \}}}|  |¡\}}|r.| js.|  |¡ |  |¡\}}|  |¡}	|rX|  ||¡}
ni }
t |	 	¡ | ¡d t
 |j ¡ }||	|  	¡  }	|  ||	||f¡\}}	||	|tf ||dœ|
—ŽfS )Nr*   )rå   Zpn)rH   rß   rÅ   rÎ   ræ   rç   rÚ   r   r×   ÚdetachrR   rØ   râ   Údict)r‰   r   rÚ   rè   ré   rê   rÝ   rT   rå   rà   Zupdate_metricsÚcommit_lossr&   r&   r'   rŽ   ù  s    


"zJukeboxBottleneckBlock.forward)T)r   r‘   r’   r   r‚   rË   rÎ   rÚ   rß   râ   ræ   rç   rë   rí   rŽ   r“   r&   r&   rŠ   r'   r¾   ~  s   	r¾   c                       s6   e Zd Z‡ fdd„Zdd„ Zddd„Zd	d
„ Z‡  ZS )ÚJukeboxBottleneckc                    s>   t ƒ  ¡  || _t ¡ | _t| jƒD ]}| j t|ƒ¡ q$d S r€   )	r   r‚   rG   r   r£   r±   r2   r>   r¾   )r‰   rW   rG   rX   rŠ   r&   r'   r‚     s
    

zJukeboxBottleneck.__init__c                 C   s   dd„ t | j|ƒD ƒ}|S )Nc                 S   s   g | ]\}}|  |¡‘qS r&   )rë   )Ú.0r·   r   r&   r&   r'   Ú
<listcomp>!  s    z,JukeboxBottleneck.encode.<locals>.<listcomp>)rM   r±   )r‰   Ú	raw_audiorT   r&   r&   r'   rë      s    
ÿzJukeboxBottleneck.encoder   Nc                 C   s0   |d kr| j }dd„ t| j||… |ƒD ƒ}|S )Nc                 S   s   g | ]\}}|  |¡‘qS r&   )rí   )rò   r·   Úzr&   r&   r'   ró   )  s    z,JukeboxBottleneck.decode.<locals>.<listcomp>)rG   rM   r±   )r‰   rT   Ústart_levelÚ	end_levelZquantised_audior&   r&   r'   rí   &  s    ÿzJukeboxBottleneck.decodec                 C   sš   g g g g f\}}}}t | jƒD ]n}| j| d  }|| }||| jd\}	}
}}| |	¡ | jsh|
 ¡ }
| |
¡ | |¡ | jr| |¡ q||||fS )Nr   )rÚ   )r2   rG   r±   Útrainingr>   rî   )r‰   Úinput_audiorT   Zquantised_statesÚcommit_lossesÚmetricsrX   r·   r   Úsampled_tokensZquantised_staterð   Zmetricr&   r&   r'   rŽ   .  s      ÿ


zJukeboxBottleneck.forward)r   N)r   r‘   r’   r‚   rë   rí   rŽ   r“   r&   r&   rŠ   r'   rñ     s   
rñ   a?  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
zÞThe Hierarchical VQ-VAE model used in Jukebox. This model follows the Hierarchical VQVAE paper from [Will Williams, Sam
Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://arxiv.org/abs/2002.08111).

    c                       sˆ   e Zd ZeZdZdd„ Zedœ‡ fdd„Zdd	d
„Zde	j
dœdd„Zddd„Zddd„Zdd„ Ze	jee	j
e	j
f dœdd„Z‡  ZS )ÚJukeboxVQVAEÚvqvaec                 C   sÞ   t |tjƒr(|jjjdd| jj d njt |tƒrd| jj	rH|jj 
¡  q’|jjjdd| jj d n.t |tƒr’| jj	r’|jjj 
¡  |jjj 
¡  t |tjƒr¸|jj 
¡  |jj d¡ t |tjƒrÚ|jd k	rÚ|jj 
¡  d S )Nr   ç{®Gáz”?©rÜ   rÊ   r•   )Ú
isinstancer   Ú	Embeddingr‡   ÚdataÚnormal_rW   Ú
init_scaler   Úzero_outÚzero_r”   rš   rˆ   r   Úfill_ÚLinear©r‰   Úmoduler&   r&   r'   Ú_init_weights^  s    
zJukeboxVQVAE._init_weightsr¿   c           
         sš  t ƒ  |¡ |j}|j}|js`dd„ t||ƒD ƒ}t |¡}|j|j	 | | |_|j 
t¡|_|jˆ _|jˆ _|jˆ _dd„ t||ƒD ƒˆ _t ˆ j¡ˆ _|j ˆ _}‡ fdd„t|ƒD ƒˆ _|jd k	rÐ|jndg| ˆ _t ¡ ˆ _t ¡ ˆ _t|ƒD ]}|jˆ j|  }|jˆ j|  }	ˆ j t|||	|d |d |d … |d |d … ƒ¡ ˆ j t|||	|d |d |d … |d |d … ƒ¡ qøt||ƒˆ _d S )Nc                 S   s   g | ]\}}|| ‘qS r&   r&   ©rò   ZstrideZdownr&   r&   r'   ró   t  s     z)JukeboxVQVAE.__init__.<locals>.<listcomp>c                 S   s   g | ]\}}|| ‘qS r&   r&   r  r&   r&   r'   ró     s     c                    s&   g | ]}t ˆ jˆ j| d    ƒ‘qS )r   )r3   rK   Úhop_lengths©rò   rX   ©r‰   r&   r'   ró   ‚  s    r   ) r   r‚   Úres_downs_tÚres_strides_trK   rM   rR   rØ   Úsample_length_in_secondsÚsampling_rateZastyper3   rÁ   ÚcommitÚdownsamplesZcumprodr  rG   r2   Úmusic_tokens_shapesZmultipliersr   r£   ÚencodersÚdecodersÚres_conv_widthÚres_conv_depthr>   r°   r»   rñ   Ú
bottleneck)
r‰   rW   r´   rµ   r  Ztop_raw_to_tokensrG   rX   r³   rœ   rŠ   r  r'   r‚   o  s@    
þ
ÿ

,ÿ,ÿzJukeboxVQVAE.__init__r   Nc                 C   sV   |d kr| j }| jj|||d}| j| |dd…  }}||dd}| ddd¡}|S )N©rö   r÷   r   r   F©r½   r*   )rG   r  rí   r  rÛ   )r‰   rT   rö   r÷   rÙ   ÚdecoderÚdequantised_stater&   r&   r'   Ú_decode–  s    zJukeboxVQVAE._decoder   ©Úreturnc           	         s^   ‡ fdd„|D ƒ}g }t ˆ ƒD ]0‰‡fdd„|D ƒ}| j|||d}| |¡ qtj|ddS )a½  
        Transforms the input `music_tokens` to their `raw_audio` representation.

        Args:
            music_tokens (`torch.LongTensor`):
                Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token
                should be an index to a corresponding `code` vector in the codebook.
            start_level (`int`, *optional*):
                Level at which the decoding process will start. Default to 0.
            end_level (`int`, *optional*):
                Level at which the decoding process will start. Default to None.
            bs_chunks (int, *optional*):
                Number of chunks to process at the same time.
        c                    s   g | ]}t j|ˆ d d‘qS ©r   r   )r   rL   )rò   Útoken©Ú	bs_chunksr&   r'   ró   °  s     z'JukeboxVQVAE.decode.<locals>.<listcomp>c                    s   g | ]}|ˆ  ‘qS r&   r&   )rò   Úchunks)rj   r&   r'   ró   ³  s     r  r   r   )r2   r!  r>   r   r,   )	r‰   rT   rö   r÷   r'  Ztoken_chunksrà   Úmusic_tokens_ir   r&   )r'  rj   r'   rí   ¡  s    zJukeboxVQVAE.decodec           
      C   sl   |d kr| j }| ddd¡ ¡ }g }t| j ƒD ]$}| j| }||ƒ}| |d ¡ q.| j |¡}	|	||… S )Nr   r*   r   r   )rG   rÛ   rO   r2   r  r>   r  rë   )
r‰   rô   rö   r÷   rù   rÙ   rX   ÚencoderÚlatent_staterT   r&   r&   r'   Ú_encode¸  s    
zJukeboxVQVAE._encodec           
      C   sN   t j||dd}g }|D ]}| j|||d}| |¡ qdd„ t|Ž D ƒ}	|	S )að  
        Transforms the `input_audio` to a discrete representation made out of `music_tokens`.

        Args:
            input_audio (`torch.Tensor`):
                Raw audio which will be encoded to its discrete representation using the codebook. The closest `code`
                form the codebook will be computed for each sequence of samples.
            start_level (`int`, *optional*, defaults to 0):
                Level at which the encoding process will start. Default to 0.
            end_level (`int`, *optional*):
                Level at which the encoding process will start. Default to None.
            bs_chunks (int, *optional*, defaults to 1):
                Number of chunks of raw audio to process at the same time.
        r   r   r  c                 S   s   g | ]}t j|d d‘qS r$  )r   r,   )rò   Zmusic_tokens_levelr&   r&   r'   ró   Ù  s     z'JukeboxVQVAE.encode.<locals>.<listcomp>)r   rL   r,  r>   rM   )
r‰   rù   rö   r÷   r'  Zaudio_chunksÚmusic_tokens_listZchunk_ir)  rT   r&   r&   r'   rë   Å  s    zJukeboxVQVAE.encodec                    s    ‡ ‡fdd„ˆj D ƒ}ˆ |¡S )Nc                    s&   g | ]}t jd ˆjˆ f|˜dd‘qS )r   rP   )r   r0   )r   ÚrandintrÁ   )rò   Úmusic_tokens_shape©Ú	n_samplesr‰   r&   r'   ró   Ý  s   ÿz'JukeboxVQVAE.sample.<locals>.<listcomp>)r  rí   )r‰   r1  rT   r&   r0  r'   r|   Ü  s    þzJukeboxVQVAE.sample)rô   r#  c                 C   s¼   |  ddd¡ ¡ }g }t| jƒD ]$}| j| }||ƒ}| |d ¡ q |  |¡\}}}	}g }
t| jƒD ]:}| j| }||||d … dd}|
 |  ddd¡¡ qft|	ƒ}| j	| }|
|fS )a"  
        Forward pass of the VQ-VAE, encodes the `raw_audio` to latent states, which are then decoded for each level.
        The commit loss, which ensure that the encoder's computed embeddings are close to the codebook vectors, is
        computed.

        Args:
            raw_audio (`torch.FloatTensor`):
                Audio input which will be encoded and decoded.

        Returns:
            `Tuple[torch.Tensor, torch.Tensor]`


        Example:
        ```python
        >>> from transformers import JukeboxVQVAE, set_seed
        >>> import torch

        >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
        >>> set_seed(0)
        >>> zs = [torch.randint(100, (4, 1))]
        >>> model.decode(zs).shape
        torch.Size([4, 8, 1])
        ```
        r   r*   r   r   Fr  )
rÛ   rO   r2   rG   r  r>   r  r  rÕ   r  )r‰   rô   rù   rÙ   rX   r*  r+  ré   rT   rú   rà   r  r   rð   Úlossr&   r&   r'   rŽ   ã  s    


zJukeboxVQVAE.forward)r   N)r   Nr   )r   N)r   Nr   )r   r‘   r’   r   Úconfig_classÚbase_model_prefixr  r‚   r!  r   ÚTensorrí   r,  rë   r|   ZFloatTensorr   rŽ   r“   r&   r&   rŠ   r'   rý   S  s   '


rý   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )Ú
JukeboxMLPc                    sT   t ƒ  ¡  |j}t|j| ƒ}t||ƒ| _t||ƒ| _t|j	 | _
t |j¡| _d S r€   )r   r‚   Úhidden_sizer3   Zmlp_multiplierr   Úc_fcÚc_projr   Zact_fnÚactr   ÚDropoutÚresid_dropoutÚdropout)r‰   rW   r©   r   rŠ   r&   r'   r‚     s    
zJukeboxMLP.__init__c                 C   s,   |   |¡}|  |¡}|  |¡}|  |¡}|S r€   )r8  r:  r9  r=  )r‰   r   r&   r&   r'   rŽ     s
    



zJukeboxMLP.forwardr   r&   r&   rŠ   r'   r6    s   r6  c                       s*   e Zd Zd‡ fdd„	Z‡ fdd„Z‡  ZS )ÚJukeboxLayerNormçñhãˆµøä>Tc                    s.   t ƒ j|||d t |¡| _d| j | _d S )N)ÚepsÚelementwise_affineiÿÿ  )r   r‚   rR   rØ   r³   Ú	max_numel)r‰   Únormalized_shaper@  rA  rŠ   r&   r'   r‚   (  s    zJukeboxLayerNorm.__init__c                    sD   |  ¡ | jkr.t || j| j| j| j¡ |¡S t	ƒ  
|¡ |¡S d S r€   )ZnumelrB  r   Ú
layer_normrC  r‡   rˆ   r@  rŒ   r   rŽ   )r‰   ÚinputrŠ   r&   r'   rŽ   -  s     zJukeboxLayerNorm.forward)r?  Tr   r&   r&   rŠ   r'   r>  '  s   r>  c                       sà   e Zd Zd2‡ fdd„	Zdd„ Zdd„ Zd3d	d
„Zdd„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zd4dd„Zd5dd„Zd6dd„Zd7d d!„Zed"d#„ ƒZd$d%„ Zd8d&d'„Zd(d)„ Zd*d+„ Zd9d,d-„Zd.d/„ Zd0d1„ Z‡  ZS ):ÚJukeboxAttentionÚ
dense_attnc              	      s”  t ƒ  ¡  |j| _|j| _|j| _t|j| j ƒ}||j | _	|| _
|| _| j	d | _|j| _|dkrˆt| j|ƒ| _t| j|d ƒ| _nt| j|d ƒ| _t|| jƒ| _t |j¡| _t |j¡| _|| _|dkrÜ| j| _n|dkrî| j| _n| j| _| jdf| jdf| jdf| jd f| jdf| jdf| jd f| j dfd	œ}|| \| _!| _"|j#| _#|j$| _$| j#d k	rv| j
| j# | _%d
| _&i | _'|j(| _)d| _*d S )Ng      Ð¿Úcross_attentionr*   r   Ú
prime_attnrm   ro   rr   )rG  Ú
block_attnÚtranspose_block_attnÚprev_block_attnÚsummary_attnÚsummary_spread_attnrH  rI  r   F)+r   r‚   r7  r©   Ún_headsÚattn_dropoutr=  r3   Zattention_multiplierZhead_dimr?   r   Úscalerx   r   Úc_attnÚc_enc_kvr9  r   r;  r<  Ú	attn_funcÚ
decode_qkvÚqkvÚ	prime_qkvÚfactored_qkvrG  rJ  rK  rL  rM  rN  rI  ÚattnÚ	attn_maskrz   r{   Ú	block_ctxr}   ÚcacheÚnb_relevant_lyric_tokensÚencoder_lenÚrecord_attn)r‰   rW   r?   rT  r   ZATTENTION_MAPrŠ   r&   r'   r‚   5  sR    


ø
zJukeboxAttention.__init__c              	   C   s  | j }| jr"t || || ¡}nt ||¡}| || ¡ |j}| ¡ }| jr˜t| j	| 
d¡| 
d¡| j| j|j|| jƒ}|d k	r˜|| dd|   }tj|dd |¡}	| jrê|	| _| jdkrê| jd d …d d …| jd …d | j…f | _|  |	¡}	t |	|¡}
|
S )Néþÿÿÿr   g    eÍÍÁr   r   rI  )rQ  rø   r   rÔ   Zmul_r   rO   rx   r~   rZ  r   rz   r{   r0   r}   r   r    Útyper_  Úattention_probrT  r^  rP  )r‰   Zquery_statesZ
key_statesZvalue_statesr|   rQ  Zattention_weightZattn_weight_typerx   rb  Úcontext_statesr&   r&   r'   Ú_attnk  s8    ø

(
zJukeboxAttention._attnc                 C   sD   |  dddd¡ ¡ }| ¡ d d… | d¡| d¡ f˜}|j|Ž S )Nr   r*   r   r   r`  r   )rÛ   rw   r   rt   )r‰   r   Únew_hidden_states_shaper&   r&   r'   Úmerge_heads  s    &zJukeboxAttention.merge_headsFc                 C   sX   |  ¡ d d… | j|  d¡| j f˜}|j|Ž }|rD| dddd¡S | dddd¡S d S )Nr   r   r*   r   r   )r   rO  rt   rÛ   )r‰   r   Úis_keyre  r&   r&   r'   Úsplit_heads’  s    ý
zJukeboxAttention.split_headsc                 C   s@   |   |¡}| j |dd}|   |¡}|  ||||¡}|  |¡}|S )NT)rg  )rh  rd  rf  )r‰   ÚqueryÚkeyrq   r|   rc  r&   r&   r'   rG  ž  s    


zJukeboxAttention.dense_attnc           
      C   sÚ   | j }|j\}}}|r0|  ||||¡ |d|¡S |jd }	| ||	 | ||¡}|	|k r|	}|d d …| d …f  ¡ }|d d …| d …f  ¡ }| || | ||¡}| || | ||¡}|  ||||¡ |||¡S d S ©Nr   )r[  rH   rG  rt   rw   )
r‰   ri  rj  rq   r|   r[  rY   rê   r©   ry   r&   r&   r'   rJ  ¦  s    
zJukeboxAttention.block_attnc                 C   sv  | j }|j\}}}|rt|d | }	|d d …|	d |…d d …f }|d d …|	d |…d d …f }|  ||||¡ |d|¡S |jd }
| ||
| ||¡}| dd¡ ¡ }| || |
| |¡}| ||| ||¡}| dd¡ ¡ }| || || |¡}| ||| ||¡}| dd¡ ¡ }| || || |¡}|  ||||¡}| |||
| |¡}| dd¡ ¡ }| ||
|¡}|S d S )Nr   r*   )r[  rH   rG  rt   Z	transposerw   )r‰   ri  rj  rq   r|   r[  rY   rê   r©   Z	block_lenry   rJ  r&   r&   r'   rK  ¶  s,    
z%JukeboxAttention.transpose_block_attnc                 C   s6  | j }|j\}}}|r¾|d | }	|	d | }
|	dkrt|d d …|
|
| …d d …f }|d d …|
|
| …d d …f }n0tj||||j|jd}tj||||j|jd}|  ||||¡ |d|¡S |jd }| || | ||¡}| ||| ||¡d d …d d…d d …d d …f }tjj	 
|d¡}| || | ||¡}| ||| ||¡d d …d d…d d …d d …f }tjj	 
|d¡}| || | ||¡}||k r|| }|| }|}| ||||¡d d …| d …f }| ¡  || ||¡}| ||||¡d d …| d …f }| ¡  || ||¡}|  ||||¡ |||¡S d S )Nr   r   ©r0   r   r   ©r   r   r   r   r   r   )r[  rH   r   r-   r0   r   rG  rt   r   ru   rv   rw   )r‰   ri  rj  rq   r|   r[  rY   rê   r©   r§   Zprev_lry   Znb_query_blocksZnb_key_blocksr&   r&   r'   rL  Ó  s8     
00
""z JukeboxAttention.prev_block_attnc           
      C   s<  | j }| j}|j\}}}	|r¦|d d …|d || d |…d d …f }tjj |d¡}|d d …|d || d |…d d …f }tjj |d¡}|  ||||¡ |d|	¡S | |||| |	¡d d …d d…dd d …f }tjj |d¡}| |||| |	¡d d …d d…dd d …f }tjj |d¡}|  ||||¡ |||	¡S d S )Nr   rp   r   )	rz   r[  rH   r   r   ru   rv   rG  rt   )
r‰   ri  rj  rq   r|   rz   r[  rY   rê   r©   r&   r&   r'   rM  ø  s    ((,,zJukeboxAttention.summary_attnc           
      C   sð   | j }| j}|j\}}}	|r"t‚nÊ| |||| |	¡d d …d d…| d …d d …f }tjj |d¡ 	¡ }| ||| |	¡}| |||| |	¡d d …d d…| d …d d …f }tjj |d¡ 	¡ }| ||| |	¡}|  
||||¡ |||	¡S d S )Nr   rm  )rz   r{   rH   ÚNotImplementedErrorrt   r   r   ru   rv   rw   rG  )
r‰   ri  rj  rq   r|   rz   r{   rY   rê   r©   r&   r&   r'   rN    s    22z$JukeboxAttention.summary_spread_attnc                 C   s>   | j }|d d …d |…f }|d d …d |…f }|  ||||¡S r€   )Ú_encoder_lenrG  )r‰   ri  rj  rq   r|   r^  r&   r&   r'   rI    s    zJukeboxAttention.prime_attnNc           	      C   sÊ   |j d }|d k	rtdƒ‚|jddd\}}}|r¾|  j|7  _|  ||¡\}}|  ¡ }|  ¡ |krp|  | ¡ |dkrª| jdkr¤| j	|dd}|  	|¡}|  	|¡}d	}n| j
d
 }| j
d }||||fS )Nr   ú)last_encoder_hidden_states should be Noner   r*   r   rG  T)ri  Frj  rq   )rH   Ú	TypeErrorrL   r}   Ú_append_cacheÚ_suff_cache_lenÚ
_cache_lenÚ_slice_cacherT  Ú_pad_to_block_ctxr\  )	r‰   r   Úlast_encoder_hidden_statesr|   Úcurr_ctxri  rj  rq   Zl_cacher&   r&   r'   rX  #  s&    





zJukeboxAttention.factored_qkvc                 C   s˜   |j d }|d k	rtdƒ‚|jddd\}}}|rŒ|  ¡ | jk rL|  ||¡ |  ¡ | jkrh|  d| j¡ | jd | jd  }}|  j|7  _||||fS )	Nr   rp  r   r*   r   r   rj  rq   )	rH   rq  rL   rt  ro  rr  ru  r\  r}   ©r‰   r   rw  r|   rx  ri  rj  rq   r&   r&   r'   rW  :  s    
zJukeboxAttention.prime_qkvc                 C   s–   |j d }|}|rl| jdkrF|  | |¡¡jddd\| jd< | jd< | jd | jd  }}|  j|7  _n|  | |¡¡jddd\}}||||fS )Nr   r   r*   r   rj  rq   )rH   r}   rS  rŒ   rL   r\  ry  r&   r&   r'   rU  H  s    

ÿ þzJukeboxAttention.decode_qkvc           
      C   sŠ   |j d }|  |¡}| j|||d\}}}}|  ||||¡}|j d |krv|  |¡}	|d d …|	|	| …d d …f  ¡ }|  |¡}|  |¡S )Nr   ©rw  r|   )rH   rR  rV  rY  Ú_offsetrw   r9  r<  )
r‰   r   rw  r|   rx  ri  rj  rq   Zattention_scoresr8   r&   r&   r'   rŽ   V  s    

  ÿ
"
zJukeboxAttention.forwardc                 C   s   | j }|| j d }|| j S rk  )r^  rz   )r‰   r^  Zencoder_blocksr&   r&   r'   ro  c  s    zJukeboxAttention._encoder_lenc                 C   s   | j dkrdS | j| | j S )NrG  r   )rT  r}   r[  )r‰   rx  r&   r&   r'   r{  i  s    
zJukeboxAttention._offsetc                 C   sr   |j d }|r|  |¡nd}|| | j d | j }|| j | | }|dkrZ|dkrZ|S t |dd||f¡S d S )Nr   r   )rH   r{  r[  r   rv   )r‰   r   ri  rê   r8   Zn_blocksrv   r&   r&   r'   rv  n  s    
z"JukeboxAttention._pad_to_block_ctxc                 C   s   d| j krdS | j d jd S )Nrj  r   r   )r\  rH   r  r&   r&   r'   rt  x  s    zJukeboxAttention._cache_lenc                 C   sh   | j d | j d | j }| j | j d | j d | j | j | jkrF| j n|| jt| j | jƒdœ}|| j S )z´
        Precondition:
            key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
            location in the context.
        r   )rG  rJ  rK  rL  Z
cross_attnrI  )r}   r[  r^  r   ro  rT  )r‰   Zprevious_block_lengthZREQUIRED_CACHE_LENr&   r&   r'   rs  {  s    ú	z JukeboxAttention._suff_cache_lenc                 C   sD   | j d d d …||…f | j d< | j d d d …||…f | j d< d S )Nrj  rq   )r\  )r‰   rA   rZ   r&   r&   r'   ru    s     zJukeboxAttention._slice_cachec                 C   s–   d| j kr || j d< || j d< nb|| }}tj| j d |gdd}tj| j d |gdd}| j d= | j d= ~~|| j d< || j d< | j d | j d fS )Nrj  rq   r   r   )r\  r   r,   )r‰   rj  rq   Úold_keyÚ	old_valuer&   r&   r'   rr  ‘  s    




zJukeboxAttention._append_cachec                 C   s4   d| _ d| jkr| jd= d| jkr*| jd= i | _d S )Nr   rj  rq   )r}   r\  r  r&   r&   r'   Ú	del_cache¡  s    

zJukeboxAttention.del_cache)rG  )F)NF)NF)NF)NF)F)N)r   r‘   r’   r‚   rd  rf  rh  rG  rJ  rK  rL  rM  rN  rI  rX  rW  rU  rŽ   Úpropertyro  r{  rv  rt  rs  ru  rr  r~  r“   r&   r&   rŠ   r'   rF  4  s0   6"
%







rF  c                       s(   e Zd Zd‡ fdd„	Zddd„Z‡  ZS )	ÚJukeboxBlockrG  c                    sd   t ƒ  ¡  |j| _t|||d| _t|jƒ| _t|ƒ| _	t|jƒ| _
|jrTd|j nd| _|| _d S )N©rT  r•   )r   r‚   r7  r³   rF  rY  r>  Úlayer_norm_0r6  ÚmlpÚlayer_norm_1Zattn_res_scaleÚ
num_layersr–   rT  )r‰   rW   r?   rT  rŠ   r&   r'   r‚   «  s    

zJukeboxBlock.__init__Fc                 C   sb   |}|   |¡}|  |||¡}|  || ¡}|  |¡}| jdkrL|| | }n|| j||   }|S )Nr•   )r‚  rY  r„  rƒ  r–   )r‰   r   rw  r|   rŸ   Zoutput_statesÚoutputr&   r&   r'   rŽ   ¶  s    


zJukeboxBlock.forward)rG  )Fr   r&   r&   rŠ   r'   r€  ª  s   r€  c                       s6   e Zd Z‡ fdd„Zdd„ Zddd„Zd	d
„ Z‡  ZS )ÚJukeboxLayerStackc                    sž   t ƒ  ¡  || _|j| _|j| _|j| _|j| _| jd k	rF|| j | _|j	| _
|j| _t| j }t ¡ | _t| jƒD ]}| j t||||ƒd¡ qtg | _d S )Nr  )r   r‚   r?   r7  r³   r…  rz   Úattention_patternr[  r]  r^  rO  r   r   r£   Ú
_attn_modsr2   r>   r€  Úsaved_attn_weights)r‰   rW   r?   rˆ  rœ   rŠ   r&   r'   r‚   Å  s    



zJukeboxLayerStack.__init__c                    s:   ‡ fdd„}t | jƒD ]\}}||ƒ|j_qˆ s6g | _dS )a-  
        Makes forward prop dump self-attention softmaxes to self.saved_attn_weights.

        Args:
            record_attn (`Union[bool,set]`):
                Either a set of layer indices indicating which layers to store, or a boolean value indicating Whether
                to dump all.
        c                    s   t ˆ tƒrˆ S | ˆ kS r€   )r  r!   )Z	layer_idx©r_  r&   r'   Ú_should_record_attnã  s    
z>JukeboxLayerStack.set_record_attn.<locals>._should_record_attnN)Ú	enumerater‰  rY  r_  rŠ  )r‰   r_  rŒ  rj   Úlayerr&   r‹  r'   Úset_record_attnÙ  s
    
z!JukeboxLayerStack.set_record_attnNFc                 C   sZ   t | jƒD ]J\}}|jdkr,||||d}n||d |d}|jjr
| j |jjj¡ q
|S )NrH  rz  )	r  r‰  rT  rY  r_  rŠ  r>   rR  r‡   )r‰   r   rw  r|   rj   Ú
attn_layerr&   r&   r'   rŽ   î  s    
  ÿzJukeboxLayerStack.forwardc                 C   s   | j D ]}|j ¡  qd S r€   )r‰  rY  r~  )r‰   r  r&   r&   r'   r~  û  s    
zJukeboxLayerStack.del_cache)NF)r   r‘   r’   r‚   r  rŽ   r~  r“   r&   r&   rŠ   r'   r‡  Ä  s   
r‡  c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚJukeboxPositionalEmbeddingc                    s$   t ƒ  ¡  t t ||f¡¡| _d S r€   )r   r‚   r   r†   r   r…   Úpos_emb)r‰   r©   r³   rŠ   r&   r'   r‚     s    
z#JukeboxPositionalEmbedding.__init__c                 C   s
   | j }|S r€   )r’  )r‰   r’  r&   r&   r'   rŽ     s    z"JukeboxPositionalEmbedding.forwardr   r&   r&   rŠ   r'   r‘     s   r‘  c                	       sL   e Zd Zd‡ fdd„	Zddd„Zdd„ Zddd„Zdd„ Zddd„Z‡  Z	S )Ú JukeboxConditionalAutoregressiveNFc                    s&  t ƒ  ¡  |j| _|j| _|dk	r&|n|j| _|dk	r:|n|j| _t 	| j|j¡| _
t |j¡| _|| _|| _|sŠt t d|jf¡¡| _t| j|jƒ| _t |j¡| _t|| jd| _|| _|j| _|jrÚd| _d| _nd| _d| _|s"tj|j| jdd| _ | jr| j
j!| j _!tj "¡ | _#dS )aa  
        Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
        set fro each configuration.

        Args:
            config (`JukeboxPriorConfig`):
                Model configuration class with all the parameters of the model. Initializing with a config file does
                not load the weights associated with the model, only the configuration. Check out the
                [`~PreTrainedModel.from_pretrained`] method to load the model weights.
            n_ctx (`int`, *optional*):
                Number of tokens or lyrics tokens provided in a single pass.
            embed_dim (`int`, *optional*):
                Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
                if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
            audio_conditioning (`bool`, *optional*, defaults to `False`):
                Whether or not the prior supports conditionning on audio.
            metadata_conditioning (`bool`, *optional*, defaults to `False`):
                Whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
            is_encoder (`bool`, *optional*, defaults to `False`):
                Whether the model is an encoder only model.
        Nr   )r?   FT©rˆ   )$r   r‚   r7  r³   r…  r?   Úmusic_vocab_sizer©   r   r  Úembed_tokensr;  Zemb_dropoutÚembed_tokens_dropoutÚmetadata_conditioningÚaudio_conditioningr†   r   r…   Ústart_tokenr‘  r’  Úpos_emb_dropoutr‡  ÚtransformerÚ
is_encoderr]  r^  Zmerged_decoderÚadd_cond_after_transformerZshare_embed_tokens_fc_proj_outr	  Úfc_proj_outr‡   ÚCrossEntropyLossr2  )r‰   rW   r?   r©   r™  r˜  r  rŠ   r&   r'   r‚     s4    
z)JukeboxConditionalAutoregressive.__init__c              	   C   s"  |j d }t ¡  | |d¡ ¡ }W 5 Q R X | js^tj|d| jf|j| j	j
d jjjjd}|}	|  |¡}
tj|
dd…dd…f |
dd…dd…f fdd}
| jrÀ| || j¡|
dd…df< n| j|
dd…df< |  |
¡|  |  ¡ ¡ | }
| j	|
|d}
| jr|
| }
|
}| jr|
S |  |
¡}
t ¡ }|rÖ|
dd…d| j…f  d| j¡}|
dd…| jd…f  d| j¡}|||	dd…d| j…f  d¡ƒt d¡ }|||	dd…| jd…f  d¡ƒt d¡ }||f}n$||
 d| j¡|	 d¡ƒt d¡ }|r||
fS |r||fS |dfS dS )	zŸ
        Args:
            tokens (`torch.tensor`):
                Can represent music tokens, lyrics tokens or both, depending on the configuration.
        r   r   r   rl  Nr   )rw  r)   ) rH   r   rÓ   rt   r.   r™  r-   r³   r0   rœ  r‰  rƒ  r8  r‡   r   r–  r,   r˜  rš  r—  r›  r’  rž  r  rŸ  r   r   r^  Zreshaper©   rR   rÖ   )r‰   r:   r™  r˜  rw  Ú	get_predsZget_actsÚget_sep_lossrY   Útargetr   ÚactivationsZloss_fnZlyric_hidden_statesZtoken_hidden_statesZ
lyric_lossZmusic_token_lossr2  r&   r&   r'   rŽ   J  sP    


ý
2ÿ ÿ
  ,,
$z(JukeboxConditionalAutoregressive.forwardc                 C   sÊ   |dkrdt j|d| j| jjjd | jjj¡}| jrP| 	|| j¡|d d …df< qn| j
|d d …df< n
|  |¡}|j|| j| jfkr¢|d d …||d …d d …f }n|}||  ¡ ||d …  | }||fS )Nr   r   r   )r   r…   r³   r–  r‡   r   r/   r0   r˜  rt   rš  rH   r?   r’  )r‰   r}   r1  r:   r™  r˜  r   Úcondr&   r&   r'   Úget_emb  s    ÿ
 z(JukeboxConditionalAutoregressive.get_embr•   r   r   c
              	   C   sd  |	d kr| j }	| jsDtj|d| jf| jjd jjj	j
d | jj¡}t ¡ ü g }
d }|r^g }ttd|	ƒdd}|D ] }|jd|	› ddd	 |  |||||¡\}}| j||dd
}| jrÂ|| }|  |¡}|rÞ| | ¡ ¡ || }t|||d}tjj|d ¡ }|
 | ¡ ¡ qt~| j ¡  tj|
dd}|rDtj|dd}W 5 Q R X |r\||fS |S d S )Nr   r   r   F©ÚleavezAncestral sampling ú music tokensT©Úrefreshrz  ©r$   r%   ©r#   r   )r?   r™  r   r-   r³   rœ  r‰  rƒ  r8  r‡   r   r/   rŸ  r0   rÓ   r   r2   Úset_descriptionr¦  rž  r>   r   r(   ÚdistributionsÚCategoricalr|   r~  r,   )r‰   r1  r™  r˜  rw  Útempr$   r%   r¡  Úsample_tokensrü   r:   ÚpredsÚiterr}   r   r¥  r&   r&   r'   r|   £  s\    
 ÿþ
    ÿ  ÿ

z'JukeboxConditionalAutoregressive.samplec                 C   s4   || d | }|g|d  |d | d f•}|S rk  r&   )r‰   ÚlengthÚ
chunk_sizeÚn_passesÚchunk_sizesr&   r&   r'   Úsplit_chunksÞ  s     z-JukeboxConditionalAutoregressive.split_chunksc              
   C   sØ  |d kr| j }|jd }t ¡  | |d¡ ¡ }W 5 Q R X tj|ddd}t|ƒ}| jsˆtj	|d| j
f| jjd jjjjd |j¡}t ¡ * |	rœg }|
d kr¬t|ƒ}
|  t|ƒ|
¡}g }d}d }t|dddD ]¼}g g  }}t||| ƒD ]6}|  |||||¡\}}|| }| |¡ | |¡ qò|| }tj|ddtj|dd }}~~|	s\~| j||d	d
}|	r| jr‚|| }~| |¡ qÖ~qÖ|	r¼tj|dd}|  |¡}| |¡ |d }ttt|ƒ|ƒdttt|ƒ|ƒƒ› ddd}|D ]}|  |||||¡\}}| j||d	d
}| jr0|| }|  |¡}|	rJ| |¡ || }t|||d}tjj|d ¡ }| |  ¡ ¡ |}qö~~| j !¡  tj|dd}|	r¸tj|dd}W 5 Q R X |	rÐ||fS |S d S )Nr   r   r   r   r   zPreparing past key valueF)rD   r¨  Trz  ú	Sampling r©  r¬  r­  )"r?   rH   r   rÓ   rt   r.   Úsplitr1   r™  r-   r³   rœ  r‰  rƒ  r8  r‡   r   r/   r0   r+   r¹  r   r2   r¦  r>   r,   rž  rŸ  r(   r¯  r°  r|   r   r~  )r‰   r1  Úlyric_and_music_tokensr™  r˜  rw  r±  r$   r%   r¡  r¶  r²  rY   Zsampled_audior³  r¸  Zx_primesrA   r%  Zcurrent_chunk_sizeZsampled_audio_primeZconds_primer}   Zx_primeZ
cond_primeZinput_tokensZ
itereratorr   r¥  rT   r&   r&   r'   Úprimed_sampleã  s´    


 ÿþ
    ÿ


ý    ÿ  ÿ


z.JukeboxConditionalAutoregressive.primed_sample)NNFFF)NNNFFF)NNNr•   r   r   FN)	NNNr•   r   r   FNN)
r   r‘   r’   r‚   rŽ   r¦  r|   r¹  r½  r“   r&   r&   rŠ   r'   r“  
  sD        ùB      ø
F        ö
;	         ôr“  c                       s*   e Zd ZdZ‡ fdd„Zddd„Z‡  ZS )ÚJukeboxMusicTokenConditionerzç
    The `JukeboxMusicTokenConditioner` takes music tokens as an input (coresponding to the codes of the VQVAE's
    codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE).
    c              	      s^   t ƒ  ¡  t |j|j¡| _|j|_t||j|j	|j
|j| |j| dd| _t|jƒ| _d S )NF)r¥   )r   r‚   r   r  r•  r7  r–  r©   r¸   r  r  r  r  Ú	upsamplerr>  rD  )r‰   rW   rX   rŠ   r&   r'   r‚   Z  s    
ù	z%JukeboxMusicTokenConditioner.__init__Nc                 C   sZ   |dkrd}|  ¡ }|  |¡}|| }| ddd¡}|  |¡}| ddd¡}|  |¡}|S )a?  
        Args:
            music_tokens (`torch.LongTensor`):
                Music tokens form the uper level in range(nb_discrete_codes)
            raw_audio_conditionning (`torch.LongTensor`, *optional*):
                Audio used when primed sampling, raw audio information that conditions the generation
        Nr   r   r*   r   )r.   r–  rÛ   r¿  rD  )r‰   rT   Zraw_audio_conditionningr   r&   r&   r'   rŽ   j  s    


z$JukeboxMusicTokenConditioner.forward)N©r   r‘   r’   Ú__doc__r‚   rŽ   r“   r&   r&   rŠ   r'   r¾  T  s   r¾  c                       s,   e Zd ZdZd‡ fdd„	Zd	dd„Z‡  ZS )
ÚJukeboxRangeEmbeddinga“  
    The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of time positional
    embedding of length `n_ctx`.

    Binning process : For each pos in position tensor, find its bin [start,end) mapped to [0,1,...,bins-1] [start,end)
    -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1] NOTE: Open ended interval on right, so start <= pos < end, not <=
    end
    Fc                    s:   t ƒ  ¡  || _|| _t ||¡| _|\| _| _|| _	d S r€   )
r   r‚   Ún_timer©   r   r  ÚembÚpos_minÚpos_maxrd   )r‰   rÃ  r©   r2   Z	out_widthrd   rŠ   r&   r'   r‚   ‹  s    
zJukeboxRangeEmbedding.__init__Nc                 C   s  t |jƒdkstd|j› ƒ‚| j|k ¡ sX|| jk  ¡ rXtd| j› d| j› d|› ƒ‚| ¡ }|d k	r†| jr~| | j| j¡}| ¡ }| j}|dkrÈt	j
d|t	j|jd d|¡| }||| |  }n|}|| j | j| j  }| j|  ¡  ¡  ¡ }|  |¡S )	Nr*   z Expected shape with 2 dims, got z
Range is [ú,z), got r   r   rC   )r+   rH   rq  rÅ  ÚallrÆ  rO   rd   rÃ  r   Zaranger0   rt   r©   Úfloorr.   rî   rÄ  )r‰   Z	pos_startZpos_endrÃ  ÚinterpolationÚpositionZnormalised_positionZbins_r&   r&   r'   rŽ   “  s$     ÿzJukeboxRangeEmbedding.forward)F)NrÀ  r&   r&   rŠ   r'   rÂ    s   	rÂ  c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚJukeboxLabelConditionerc                    sº   t ƒ  ¡  |j}|j}|j}|j\}}|j}|j| _t 	||¡| _
t 	||¡| _|| _| jr¶|j| |j| f}	d|j| f}
d}td||	|ƒ| _t|||
|ƒ| _t||||dd| _d S )Nr   )r   r•   r   T)rd   )r   r‚   r7  Útiming_dimsr  Zmetadata_dimsr?   Úmax_nb_genresr   r  Úbow_genre_embÚ
artist_embÚinclude_time_signalÚmin_durationÚmax_durationrÂ  Útotal_length_embÚabsolute_pos_embÚrelative_pos_emb)r‰   rW   rÑ  r©   rÍ  r  Z	nb_genresZ
nb_artistsr/  Ztotal_length_rangeZabsolute_pos_rangeZrelative_pos_rangerŠ   r&   r'   r‚   ²  s8    

   ÿ    ÿz JukeboxLabelConditioner.__init__c                 C   s  |d d …dd…f }|d d …dd…f }|d d …dd…f }|d d …dd…f }|d d …dd …f }|   |¡}|dk ¡  d¡}|  | d¡¡| jddd}	|	| }
| jr |||  }}| ¡ }| ¡ }| ¡ }|  |¡|  ||¡ |  	|| || ¡ }nd }|
|fS )Nr   r   r*   r   é   Trã   )
rÐ  rO   r5   rÏ  rd   rÕ   rÑ  rÔ  rÕ  rÖ  )r‰   r[   r7   r8   rµ  ZartistZgenrerÐ  rx   Z	genre_embZ	start_embrA   rZ   r’  r&   r&   r'   rŽ   Ë  s,    

ÿþÿzJukeboxLabelConditioner.forwardr   r&   r&   rŠ   r'   rÌ  ±  s   rÌ  c                       sâ   e Zd ZdZeZdd„ Zd+edœ‡ fdd„Zd,d
d„Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zd-dd„Zd.dd„Zdd„ Zd/d d!„Zd0d"d#„Zd$d%„ Zg dd	d	fd&d'„Zd1ejeeej  ee ee eej d(œd)d*„Z‡  ZS )2ÚJukeboxPrioruˆ  
    The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be
    seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderÃ¹
    is defined, it also models the `next character` prediction on the lyrics. Can be conditionned on timing, artist,
    genre, lyrics and codes from lower-levels Priors.

    Args:
        config (`JukeboxPriorConfig`):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
        level (`int`, *optional*):
            Current level of the Prior. Should be in range `[0,nb_priors]`.
        nb_priors (`int`, *optional*, defaults to 3):
            Total number of priors.
        vqvae_encoder (`Callable`, *optional*):
            Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of
            the vqvae module to avoid getting the parameters.
        vqvae_decoder (`Callable`, *optional*):
            Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of
            the vqvae module to avoid getting the parameters.
    c                 C   sŽ  | j j}t|tjƒr.|jjjdd| d nt|tƒrf| j j	rN|jj 
¡  n|jjjdd| d nÖt|tƒrˆ|jjjdd| d n´t|tƒr¬|jjjjdd| d nt|tƒrÚt|dƒrÚ|jjjjdd| d nbt|tƒr
t|dƒr
|jjjdd| d n2t|tƒr<| j j	r<|jjj 
¡  |jjj 
¡  t|tjƒrd|jj 
¡  |jj d¡ t|tjƒrŠ|jd k	rŠ|jj 
¡  d S )Nr   rÿ   r   rÈ   Úlm_headrš  r•   )rW   r  r  r   r  r‡   r  r  r   r  r  r‘  r’  rÂ  rÄ  r“  ÚhasattrrÙ  rš  r”   rš   Zweigthrˆ   r   r  r	  )r‰   r  r  r&   r&   r'   r    s.    


zJukeboxPrior._init_weightsNr   r¿   c              
      sh  t ƒ  |¡ || _|| _|| _|d k	r*|n|j| _d| j› | _|j| _|jdk| _	|j| _|j
| _
| jdk| _| jd | _| jrt|| jƒ| _|j| _| jr°t|| j d| _|j| _|jr|j|jg| _d|jg| _|j| _|j| _t||j|j |j|j | jp| jdd| _n¨|j}| jdkr | j	r |j| _|j| _|j| _t|| j| jdddd| _t |j|jƒ| j_!t"|jƒ| j_#t$j%|j|jdd	| j_&nd| _t|| jp¶| j| jd
| _|j| _'| j| j' | _(dd„ t)|j*|j+ƒD ƒ| _,| jdkr
| j,| j nd | _-t. /| j,d || j … ¡| _0| j| j0 | _1t2 3d| j› d| j-› d| j0› d| j1› ¡ d S )Nzpriors.r   r   )rÑ  T)r?   r©   r™  r˜  F)r?   r©   r™  r˜  r  r”  )r™  r˜  c                 S   s   g | ]\}}|| ‘qS r&   r&   r  r&   r&   r'   ró   m  s     z)JukeboxPrior.__init__.<locals>.<listcomp>zLevel:z, Cond downsample:z, Raw to tokens:z, Sample length:)4r   r‚   Úvqvae_encoderÚvqvae_decoderrG   rX   r4  r?   r]  Úlyric_conditioningÚencoder_loss_fractionr™  Ú
cond_levelr¾  Úconditioner_blocksr˜  rÌ  Úmetadata_embeddingÚis_encoder_decoderÚinput_shapesZlyric_vocab_sizeÚembed_dim_shiftr7  r³   r“  r•  rV   Úencoder_configZlyric_acts_widthZencoder_widthÚencoder_dimr*  r   r¹   r>  Úfinal_layer_normr   r	  rÙ  Únext_token_prediction_loss_dimsÚtotal_loss_dimsrM   r  r  r  Úcond_downsamplerR   rØ   Úraw_to_tokensrK   ÚloggerÚinfo)r‰   rW   rX   Ú	nb_priorsrÛ  rÜ  rå  rŠ   r&   r'   r‚     sz    

û

úý"ÿzJukeboxPrior.__init__Fc                 C   s|   |  ¡ }||d d …df< t| jƒ|d d …df< t|| j ƒt|| j ƒ |d d …dd…f< |  |¡\}}|rt||fS |S d S rì   )r   r3   rK   rë  Úset_metadata_lyric_tokens)r‰   rU   rA   r7   r8   rE   r[   r;   r&   r&   r'   rJ   w  s    ,zJukeboxPrior.get_metadatac                 C   sô   | j dkrètj|jd | j ftj|jd}g }t|jd ƒD ]|}| ¡ dd…d| jj	 d…f }||df ||df ||df   }}}t
|| j |||ƒ\}	}
|	||dd…f< | |
¡ q<tj|dd…dd| jj	 …f |fdd|fS |dfS dS )	z
        Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens.
        r   rC   Nr×  r   r*   r   r   )r]  r   r-   rH   r.   r0   r2   r   rá  rÎ  r<   r>   r,   )r‰   rU   Ztokens_listZindices_listÚidxr6   r7   r8   r9   r:   r;   r&   r&   r'   rï  ˆ  s.    
  ÿ (    ÿ(þz&JukeboxPrior.set_metadata_lyric_tokensc                 C   s”   | j dkrŒ|| j d  }|dd…|| j || j …f }| j| j |d jd  }|dkr„t d|¡ |j¡}tj||fdd 	¡ }|g}nd}|S )zE
        Extracts current level's conditioning music tokens.
        r   r   Nr   r   )
rX   rê  r?   rH   r   r-   r/   r0   r,   r.   )r‰   rT   rA   rZ   Úmusic_tokens_condZmissing_cond_lenZ	init_condÚmusic_tokens_condsr&   r&   r'   Úget_music_tokens_conds¡  s    
 z#JukeboxPrior.get_music_tokens_condsc                 C   sª   |d j d }tt|ƒƒD ]&}|| t| j| ƒ  |d¡||< qtt|ƒƒD ]>}|| dkrNtj|| j| | j	f|d j
|d jd||< qNtj|ddtj|ddfS )zÅ
        Shifts the input tokens to account for the dictionary merge. The embed_dim_shift give by how much the music
        tokens should be shifted by. It is equal to `lyric_vocab_size`.
        r   r   NrC   r   r   )rH   r2   r+   r3   rä  rt   r   r-   rã  r³   r   r0   r,   )r‰   r:   ZcondsrY   rj   r&   r&   r'   Úprior_preprocess±  s    $  ÿzJukeboxPrior.prior_preprocessc                 C   s’   |j d }| jd |j d | jd  f}ttj||ddƒ}tt|ƒƒD ]@}t| j| ƒ}|| |  	|d¡||< tj
|| dd||< qH|d S )zü
        Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
        shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music
        tokens.
        r   r   r   r   )r   )rH   rã  r1   r   r»  r2   r+   r3   rä  rt   rd   )r‰   r:   rY   Zdimsrj   Z
bins_shiftr&   r&   r'   Úprior_postprocessÂ  s    
zJukeboxPrior.prior_postprocessc                 C   sD   |d| j d … }d}ttt|| jgƒƒƒD ]\}}|||ƒ}q,|S )zj
        Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
        Nr   )rß  rS   r1   rM   rà  )r‰   rò  r™  rñ  Zconditioner_blockr&   r&   r'   r–  Ô  s
    zJukeboxPrior.embed_tokensr   c              	   C   sF   |dkr| j }|dkr| j}t ¡  | j||||d}W 5 Q R X |S )zi
        Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states.
        N©rö   r÷   r'  )rX   rG   r   rÓ   rÛ  )r‰   r   rö   r÷   r'  rÙ   r&   r&   r'   rë   Þ  s    
   ÿzJukeboxPrior.encodec              	   C   sF   |dkr| j }|dkr| j}t ¡  | j||||d}W 5 Q R X |S )zK
        Usamples the sequence of codebook vectors to a raw audio.
        Nrö  )rX   rG   r   rÓ   rÜ  )r‰   rT   rö   r÷   r'  r†  r&   r&   r'   rí   í  s    
   ÿzJukeboxPrior.decodec                 C   s‚   |dk	rD|j d | j }|dd…d|…f |dd…|d…f  }}nd\}}| jr\|  |¡nd\}}| jrt|  |¡n|}|||fS )z“
        Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens
        can be None.
        Nr   )NN)rH   r]  r˜  rá  r™  r–  )r‰   rò  r[   Zn_labelsÚlyric_tokensr˜  Zmetadata_posr™  r&   r&   r'   Úget_condû  s    ,ÿzJukeboxPrior.get_condr•   r   r   c
                 C   s<  |dkp|j d dk}
dddœ|
 }t |› d|› d|› d	|› d
|› 	¡ t ¡ à |  ||¡\}}}| jrÜ|
rˆ|  |gd|g¡\}}n|  ||gd|g¡\}}|	dk	r²|	| j7 }	| j	j
|||||||||	d	}|  |¡}nR| j|dd}|
r| j	j||||||||	d}n | j	j
||||||||||	d
}W 5 Q R X |S )a  
        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas.

        Args:
            n_samples (`int`):
                Number of samples to generate.
            music_tokens (`List[torch.LongTensor]`, *optional*):
                Previously gemerated tokens at the current level. Used as context for the generation.
            music_tokens_conds (`List[torch.FloatTensor]`, *optional*):
                Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
                conditionned on the upper-level tokens.
            metadata (`List[torch.LongTensor]`, *optional*):
                List containing the metatdata tensor with the artist, genre and the lyric tokens.
            temp (`float`, *optional*, defaults to 1.0):
                Sampling temperature.
            top_k (`int`, *optional*, defaults to 0):
                Top k probabilities used for filtering.
            top_p (`float`, *optional*, defaults to 0.0):
                Top p probabilities used for filtering.
            chunk_size (`int`, *optional*):
                Size of the chunks used to prepare the cache of the transformer.
            sample_tokens (`int`, *optional*):
                Number of tokens to sample.

        Nr   r   Ú	AncestralÚPrimed)TFz
 sampling z samples with temp=z, top_k=z, top_p=)r±  r$   r%   r¶  r²  T)r|   )r±  r$   r%   r²  )rH   rì  rí  r   rÓ   rø  râ  rô  r]  rV   r½  rõ  Úget_encoder_statesr|   )r‰   r1  rT   rò  r[   r±  r$   r%   r¶  r²  Zno_past_contextÚnamer™  r˜  r÷  r¼  rw  r&   r&   r'   r|     sl    %&
 ÿ
 ÿ
÷øözJukeboxPrior.samplec                 C   sV   | j dkrN| jrN|r$| j |j¡| _|  |ddd¡}| j |¡}| j |¡}nd}|S )z›
        Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
        the lyric encoder.
        r   N)r]  rÝ  r*  r/   r0   r¹   rç  )r‰   r÷  r|   Z
lyric_actsrw  r&   r&   r'   rû  j  s    zJukeboxPrior.get_encoder_statesc                 C   sP   | j r<| j |¡}tj | d| j¡| d¡¡t 	d¡ }nt
jd|jd}|S )zW
        Computes the loss for the lyric encoder: next lyric token prediction.
        r   r)   r   rn   )rÝ  r*  rÙ  r   ru   Zcross_entropyrt   ræ  rR   rÖ   r   Ztensorr0   )r‰   rw  Ztarget_lyricsÚencoder_lossr&   r&   r'   Úget_encoder_lossy  s     ÿþzJukeboxPrior.get_encoder_lossc                 C   s  |r| j j |¡ |  ||¡\}}}| jr`|  ||gd|g¡\}	}| j |	||d|d\\}
}}n.|  |¡}|  ||¡}
| j |||||d\}}| j|
 | j	 | j
 }||| j | j
 7 }| ¡  ¡ |
 ¡  ¡ | ¡  ¡ dœ}|rð| ¡  ¡ |d< |r| j jj}| j j d¡ |S ||fS dS )z¢
        Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
        vqvae's encoding layers.
        NT)r¢  r¡  )r¡  )Zbpdrý  Únext_token_prediction_lossr³  F)rV   rœ  r  rø  râ  rô  rû  rþ  rÞ  r]  ré  rè  r   rî   rŠ  )r‰   rT   rò  r[   r¡  rF   r™  r˜  r÷  r:   rý  rÿ  r³  rw  r2  rû   rŠ  r&   r&   r'   rN   †  sJ     ÿ    ÿ
û



ý
zJukeboxPrior.forward_tokens)r   r[   rí   r¡  r#  c                 C   sV   |j d }| j||d^}}| j||||d\}}	|rH|  |f|•¡}
nd}
|
||	fS )a÷  
        Encode the hidden states using the `vqvae` encoder, and then predicts the next token in the `forward_tokens`
        function. The loss is the sum of the `encoder` loss and the `decoder` loss.

        Args:
            hidden_states (`torch.Tensor`):
                Hidden states which should be raw audio
            metadata (`List[torch.LongTensor]`, *optional*):
                List containing the metadata conditioning tensorwith the lyric and the metadata tokens.
            decode (`bool`, *optional*, defaults to `False`):
                Whether or not to decode the encoded to tokens.
            get_preds (`bool`, *optional*, defaults to `False`):
                Whether or not to return the actual predicitons of the model.
        r   r&  )rT   rò  r[   r¡  N)rH   rë   rN   rí   )r‰   r   r[   rí   r¡  rY   rT   rò  r2  rû   rà   r&   r&   r'   rŽ   ³  s    
ü
zJukeboxPrior.forward)Nr   NN)F)NNr   )NNr   )NNNr•   r   r   NN)F)FF)r   r‘   r’   rÁ  r   r3  r  r‚   rJ   rï  ró  rô  rõ  r–  rë   rí   rø  r|   rû  rþ  rN   r   r5  r   r   Ú
LongTensorr!   rŽ   r“   r&   r&   rŠ   r'   rØ  é  sL   Y



        ö
_
   ÿ
1  ûúrØ  c                       s4   e Zd ZdZeZdZdZdd„ Z‡ fdd„Z	‡  Z
S )ÚJukeboxPreTrainedModelz†
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    ZjukeboxFc                 C   s$   t |tƒst |tƒr | |j¡ d S r€   )r  rØ  rý   Úapplyr  r
  r&   r&   r'   r  á  s    z$JukeboxPreTrainedModel._init_weightsc                    s   t ƒ j||Ž d S r€   )r   r‚   )r‰   ÚinputsÚkwargsrŠ   r&   r'   r‚   å  s    zJukeboxPreTrainedModel.__init__)r   r‘   r’   rÁ  r   r3  r4  Zsupports_gradient_checkpointingr  r‚   r“   r&   r&   rŠ   r'   r  ×  s   r  a"  
            labels (`List[torch.LongTensor]` of length `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_length)` :
                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
                condition the generation.
            sampling_kwargs (`Dict[Any]`):
                Various additional sampling arguments that are used by the `_sample` function. A detail list of the
                arguments can bee seen in the [`_sample`] function documentation.
ao  The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`,
    `continue_sample` and `ancestral_sample`. It does not have a `forward` method as the training is not end to end. If
    you want to fine-tune the model, it is recommended to use the `JukeboxPrior` class and train each prior
    individually.
    c                       sô   e Zd ZdgZ‡ fdd„Zdd„ Zd*d	d
„Zd+dd„Zdd„ Zdd„ Z	dd„ Z
dd„ Ze ¡ d,eej dœdd„ƒZedƒd-eej dœdd „ƒZed!eƒeej dœd"d#„ƒZed$eƒeej dœd%d&„ƒZed'eƒeej dœd(d)„ƒZ‡  ZS ).ÚJukeboxModelr€  c                    sJ   t ƒ  ˆ ¡ ˆ j}t|ƒ| _|  ˆ ¡ t ‡ fdd„tˆ j	ƒD ƒ¡| _
d S )Nc                    s   g | ]}t ˆ j| |ƒ‘qS r&   )rØ  Úprior_configsr  r¿   r&   r'   ró   	  s     z)JukeboxModel.__init__.<locals>.<listcomp>)r   r‚   Úvqvae_configrý   rþ   Úset_shared_paramsr   r£   r2   rî  Úpriors)r‰   rW   r  rŠ   r¿   r'   r‚   þ  s    

ÿzJukeboxModel.__init__c                 C   s@   |j D ]4}|j|_|j|_|j|_|j|_|j|_|j|_qdS )zÄ
        Initialises the parameters that are shared. This has to be done here because the list of `JukeboxPriorConfig`
        is nest, and is thus unreachable in the `from_dict` function
        N)r  r  rÍ  rÒ  rÓ  rÎ  r˜  )r‰   Zmodel_configrW   r&   r&   r'   r  	  s    
zJukeboxModel.set_shared_paramsr   Nr   c                 C   s   | j  ||||¡S r€   )rþ   rí   )r‰   rT   rö   r÷   r'  r&   r&   r'   rí   	  s    zJukeboxModel.decodec                 C   s   | j  ||||¡S r€   )rþ   rë   )r‰   rù   rö   r÷   r'  r&   r&   r'   rë   	  s    zJukeboxModel.encodec                    sn   |ˆ  d ˆ  }t |tjƒr,tj|ˆ ddS t |tƒrPtt‡ fdd„|D ƒŽ ƒS |d krbd g| S tdƒ‚d S )Nr   r   r   c                    s   g | ]}t j|ˆ d d‘qS r$  )r   r»  )rò   r`   ©Ú
split_sizer&   r'   ró   	  s     z,JukeboxModel.split_batch.<locals>.<listcomp>zUnknown input type)r  r   r5  r»  r1   rM   rq  )r‰   Úobjr1  r  r·  r&   r
  r'   Úsplit_batch	  s    

zJukeboxModel.split_batchc              	   C   sj   | j | }|| }	|j}
|	jd }||
| k r@|| |d< d}n|
|d< ||
 | }|  |||||||¡S )Nr   r²  r   )r	  r?   rH   Úsample_single_window)r‰   rT   rU   r8   Úsampling_kwargsrX   Ztokens_to_sampleÚmax_batch_sizerV   rü   r?   Znb_sampled_tokensrA   r&   r&   r'   Úsample_partial_window&	  s    

z"JukeboxModel.sample_partial_windowc                 C   sÊ  | j | }|d jd }	|j}
||
 }|| d d …||…f }| dd ¡}d|krZ|| }|jd }||jd  }t d|› d|› d|| › d|› d	¡ |dkr¦|S | |||¡}| ||| j|¡}|  	||	|¡}|  	||	|¡}|  	||	|¡}g }t
t|||ƒd	d
}|D ]z\}}}ddg|jd dk }|jd|› d|› d|› d| j|j › dd |jf |jd |||dœ|—Ž}| |¡ qtj|dd}|d d …| d …f }tj|| |gdd||< |S )Nr   r²  r   rº  z tokens for [rÇ  z]. Conditioning on z tokensFr§  rù  rú  z[prior level z] z
 Sampling z tokens out of Trª  )r1  rT   rò  r[   r   )r	  rH   r?   Úgetrì  rí  ró  rJ   r7   r  r   rM   r®  rë  r|   r>   r   r,   )r‰   rT   rU   r8   r  rX   rA   r  rV   r1  r?   rZ   Zprevious_sampled_tokensr²  Zconditioning_tokensZ
new_tokensrò  r[   r-  Zmusic_tokens_conds_listZmetadata_listr:   r¶   r)  Zmusic_tokens_conds_ir]   rü  r\   rü   Zmusic_tokens_newr&   r&   r'   r  7	  sR    

 ÿ"ýüûz!JukeboxModel.sample_single_windowc	              
   C   s`   || j | jkrFt|| j | j|ƒ}	|	D ]}
|  ||||||
|¡}q(n|  |||||||¡}|S r€   )r	  r?   rB   r  r  )r‰   rT   rU   r8   r  rX   r7   r@   r  r¶   rA   r&   r&   r'   Úsample_levelr	  s,          ÿ      ÿzJukeboxModel.sample_levelé    ç\Âõ(\ï?é   é   FTr"  c                 C   sÚ  | j d }|dk	r|}nt|	| jj ƒ|j |j }|dkrJtt| j ƒƒ}|| _|D ]~}|t| j ƒd krpdn|||dœ}|| j | j }t| jj| | j | j	 ƒ}||kr´|n|}|  
||| ||||||¡}|rT| j || j¡ t ¡ @ t| j ƒ| d }| jj|d|d … ||| jd d}W 5 Q R X d|› }tj |¡sZt |¡ t|||| ¡ d |
rT| j d dk	rT| j d jdkrTt ¡   t||d | j d | jƒ}W 5 Q R X t d	|i|› d
¡ qT|S )aH  
        Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving
        the generated raw audio at each step.

        Args:
            music_tokens (`List[torch.LongTensor]`):
                A sequence of music tokens of length `self.levels` which will be used as context to continue the
                sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain
                level.
            labels (`List[torch.LongTensor]`):
                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
                which are used to condition the generation.
            sample_levels (`List[int]`):
                List of the desired levels at which the sampling will be done. A level is equivalent to the index of
                the prior in the list of priors
            metas (`List[Any]`, *optional*):
                Metadatas used to generate the `labels`
            chunk_size (`int`, *optional*, defaults to 32):
                Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks
                means faster memory filling but more consumption.
            sampling_temperature (`float`, *optional*, defaults to 0.98):
                Temperature used to ajust the randomness of the sampling.
            lower_batch_size (`int`, *optional*, defaults to 16):
                Maximum batch size for the lower level priors
            max_batch_size (`int`, *optional*, defaults to 16):
                Maximum batch size for the top level priors
            sample_length_in_seconds (`int`, *optional*, defaults to 24):
                Desired length of the generation in seconds
            compute_alignments (`bool`, *optional*, defaults to `False`):
                Whether or not to compute the alignment between the lyrics and the audio using the top_prior
            sample_tokens (`int`, *optional*):
                Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy
                experiments
            offset (`int`, *optional*, defaults to 0):
                Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is
                greater than 0, the lyrics will be shifted take that intoaccount
            save_results (`bool`, *optional*, defaults to `True`):
                Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
                time.
            sample_length (`int`, *optional*):
                Desired length of the generation in samples.

        Returns: torch.Tensor

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JukeboxModel, set_seed
        >>> import torch

        >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()

        >>> labels = tokenizer(**metas)["input_ids"]
        >>> set_seed(0)
        >>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
        >>> zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
        >>> zs[0]
        tensor([[1853, 1369, 1150, 1869, 1379, 1789,  519,  710, 1306, 1100, 1229,  519,
              353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
             1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
             1804,  541, 1804, 1434]])
        ```
        r   Nr   g®Gáz®ï?)r±  r¶  r²  )rö   r'  zjukebox/level_)rh   ri   r_   z/lyric_alignments.pt)r	  r3   rW   r  rë  r2   r+   r7   rI   r?   r  rþ   r/   r0   r   rÓ   rí   rH   Úosrk   ÚexistsÚmakedirsrl   rO   r]  ra   rf   )r‰   rT   rU   Úsample_levelsrh   r¶  Zsampling_temperatureZlower_batch_sizer  r  Zcompute_alignmentsr²  r8   Zsave_resultsrK   Z	top_priorr7   rX   r  Ztotal_token_to_sampler@   rö   rô   Zlogdirr_   r&   r&   r'   Ú_sample‚	  sZ    U
þ
ýø
  ÿ

"
$zJukeboxModel._samplea  
        Generates music tokens based on the provided `labels. Will start at the desired prior level and automatically
        upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
        the VQ-VAE decoder to convert the music tokens to raw audio.

        Args:
            labels (`List[torch.LongTensor]`) :
                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
                which are used to condition the generation.
            n_samples (`int`, *optional*, default to 1) :
                Number of samples to be generated in parallel.
        c                    sN   |  dttt| jƒƒƒ¡}‡ ‡fdd„tt| jƒƒD ƒ}| j|ˆ |f|Ž}|S )aR  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, JukeboxModel, set_seed

        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")

        >>> lyrics = "Hey, are you awake? Can you talk to me?"
        >>> artist = "Zac Brown Band"
        >>> genre = "Country"
        >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
        >>> set_seed(0)
        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)

        >>> with torch.no_grad():
        ...     model.decode(music_tokens)[:, :10].squeeze(-1)
        tensor([[-0.0219, -0.0679, -0.1050, -0.1203, -0.1271, -0.0936, -0.0396, -0.0405,
            -0.0818, -0.0697]])
        ```
        r  c                    s&   g | ]}t jˆd t jˆ d  jd‘qS )r   rC   )r   r-   r.   r0   )rò   ré   ©rU   r1  r&   r'   ró   5
  s    z1JukeboxModel.ancestral_sample.<locals>.<listcomp>©Úpopr1   r2   r+   r	  r  )r‰   rU   r1  r  r  rT   r&   r  r'   Úancestral_sample
  s    'ÿzJukeboxModel.ancestral_sampleaz  Generates a continuation of the previously generated tokens.

        Args:
            music_tokens (`List[torch.LongTensor]` of length `self.levels` ) :
                A sequence of music tokens which will be used as context to continue the sampling process. Should have
                `self.levels` tensors, each corresponding to the generation at a certain level.
        c                 K   s0   |  dttt| jƒƒƒ¡}| j|||f|Ž}|S )Nr  r  ©r‰   rT   rU   r  r  r&   r&   r'   Úcontinue_sample;
  s    zJukeboxModel.continue_samplea„  Upsamples a sequence of music tokens using the prior at level `level`.

        Args:
            music_tokens (`List[torch.LongTensor]` of length `self.levels` ) :
                A sequence of music tokens which will be used as context to continue the sampling process. Should have
                `self.levels` tensors, each corresponding to the generation at a certain level.
        c                 K   s4   |  dttt| jƒd ƒƒ¡}| j|||f|Ž}|S )Nr  r   r  r!  r&   r&   r'   ÚupsampleJ
  s    zJukeboxModel.upsamplea'  Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
        used: as conditioning for each level, which means that no ancestral sampling is required.

        Args:
            raw_audio (`List[torch.Tensor]` of length `n_samples` ) :
                A list of raw audio that will be used as conditioning information for each samples that will be
                generated.
        c              	   K   sv   |  dttt| jƒƒƒ¡}| j |j¡ ¡  t	 
¡ & | jj|dt| jƒ|jd d}W 5 Q R X | j|||f|Ž}|S )Nr  r   rö  )r  r1   r2   r+   r	  rþ   r/   r0   rO   r   rÓ   rë   rH   r  )r‰   rô   rU   r  r  rT   r&   r&   r'   r½  Y
  s    
   ÿzJukeboxModel.primed_sample)r   Nr   )r   Nr   )Nr  r  r  r  r  FNr   TN)r   )r   r‘   r’   Z_no_split_modulesr‚   r  rí   rë   r  r  r  r  r   rÓ   r   r   r  r
   r   Ú JUKEBOX_SAMPLING_INPUT_DOCSTRINGr"  r#  r½  r“   r&   r&   rŠ   r'   r  ó  sX   		

;           ñð ÿø
ø
	ör  )DrÁ  r¡   r  Útypingr   r   r   rQ   rR   r   Ztorch.nn.functionalr   ru   r   Ztorch.nnr   ZFusedLayerNormr¤  r   Zmodeling_utilsr	   Úutilsr
   r   Zutils.loggingr   Zconfiguration_jukeboxr   r   r   r   Z
get_loggerr   rì  Z%JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LISTrO   r(   r<   rB   ra   rl   r~   ÚModuler   r”   r    r¨   r°   r¸   r»   r¾   rñ   ZJUKEBOX_START_DOCSTRINGrý   r6  r>  rF  r€  r‡  r‘  r“  r¾  rÂ  rÌ  rØ  r  r$  r  r&   r&   r&   r'   Ú<module>   s‚   
þ&"
9 *û :  x<
  L-08   q
ú