U
    ,-eE                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZmZ ddlZddlZddlmZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ ee Z!d
dddZ"ddiddiddidZ#ddiZ$G dd deZ%dS )z(Tokenization classes for OpenAI Jukebox.    N)INFINITY)AnyDictListOptionalTupleUnion   )
AddedTokenPreTrainedTokenizer)BatchEncoding)
TensorTypeis_flax_availableis_tf_availableis_torch_availablelogging)_is_jax	_is_numpyzartists.jsonzlyrics.jsonzgenres.json)artists_filelyrics_filegenres_fileZjukeboxz=https://huggingface.co/ArthurZ/jukebox/blob/main/artists.jsonz<https://huggingface.co/ArthurZ/jukebox/blob/main/genres.jsonz<https://huggingface.co/ArthurZ/jukebox/blob/main/lyrics.json)r   r   r      c                       s  e Zd ZdZeZeZeZ	ddgZ
dddgdddf fd	d
	Zedd Zdd Zdd Zdd Zdd Zd/eeeeeeeeeeef f dddZdd ZeedddZee eddd Zd0eeeef  ed"d#d$Zd1e d'd(d)Z!d2eee ee d*d+d,Z"d-d. Z#  Z$S )3JukeboxTokenizeraq  
    Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
        - Artists, unique ids are associated to each artist from the provided dictionary.
        - Genres, unique ids are associated to each genre from the provided dictionary.
        - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
        vocabulary.

    This tokenizer does not require training. It should be able to process a different number of inputs:
    as the conditioning of the model can be done on the three different queries. If None is provided, defaults values will be used.:

    Depending on the number of genres on which the model should be conditioned (`n_genres`).
    ```python
    >>> from transformers import JukeboxTokenizer

    >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
    >>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"]
    [tensor([[   0,    0,    0, 6785,  546,   41,   38,   30,   76,   46,   41,   49,
               40,   76,   44,   41,   27,   30]]), tensor([[  0,   0,   0, 145,   0]]), tensor([[  0,   0,   0, 145,   0]])]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    If nothing is provided, the genres and the artist will either be selected randomly or set to None

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to:
    this superclass for more information regarding those methods.

    However the code does not allow that and only supports composing from various genres.

    Args:
        artists_file (`str`):
            Path to the vocabulary file which contains a mapping between artists and ids. The default file supports
            both "v2" and "v3"
        genres_file (`str`):
            Path to the vocabulary file which contain a mapping between genres and ids.
        lyrics_file (`str`):
            Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
        version (`List[str]`, `optional`, default to `["v3", "v2", "v2"]`) :
            List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
            `v2`.
        n_genres (`int`, `optional`, defaults to 1):
            Maximum number of genres to use for composition.
        max_n_lyric_tokens (`int`, `optional`, defaults to 512):
            Maximum number of lyric tokens to keep.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    	input_idsZattention_maskv3v2r      z<|endoftext|>c              	      s0  t |trt|dddn|}|| _|| _|| _d|i| _t|dd}	t	|	| _
W 5 Q R X t|dd}	t	|	| _W 5 Q R X t|dd}	t	|	| _W 5 Q R X d}
t| jdkr|
dd	}
t|
| _d
d | j
 D | _dd | j D | _dd | j D | _t jf ||||d| d S )NF)lstriprstripr   utf-8encoding#[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+O   z\-'z\-+'c                 S   s   i | ]\}}||qS  r$   .0kvr$   r$   q/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/jukebox/tokenization_jukebox.py
<dictcomp>   s      z-JukeboxTokenizer.__init__.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r$   r$   r%   r$   r$   r)   r*      s      c                 S   s   i | ]\}}||qS r$   r$   r%   r$   r$   r)   r*      s      )	unk_tokenn_genresversionmax_n_lyric_tokens)
isinstancestrr
   r-   r.   r,   Z_added_tokens_decoderopenjsonloadartists_encodergenres_encoderlyrics_encoderlenreplaceregexcompileout_of_vocabitemsartists_decodergenres_decoderlyrics_decodersuper__init__)selfr   r   r   r-   r.   r,   r+   kwargsZvocab_handleZoov	__class__r$   r)   rA   w   s4    
zJukeboxTokenizer.__init__c                 C   s   t | jt | j t | j S )N)r7   r4   r5   r6   rB   r$   r$   r)   
vocab_size   s    zJukeboxTokenizer.vocab_sizec                 C   s   | j | j| jdS )Nr4   r5   r6   rH   rF   r$   r$   r)   	get_vocab   s    zJukeboxTokenizer.get_vocabc                    s    fdd|D }t t|D ]B} fdd|| D ||< || dg jt||    ||< q fdd|d D g g g}|||fS )zConverts the artist, genre and lyrics tokens to their index using the vocabulary.
        The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
        the lyrics token sequence.
        c                    s   g | ]} j |d qS r   )r4   get)r&   artistrF   r$   r)   
<listcomp>   s     z9JukeboxTokenizer._convert_token_to_id.<locals>.<listcomp>c                    s   g | ]} j |d qS rJ   )r5   rK   r&   genrerF   r$   r)   rM      s     c                    s   g | ]} j |d qS rJ   )r6   rK   r&   	characterrF   r$   r)   rM      s     r   )ranger7   r,   )rB   Zlist_artistsZlist_genresZlist_lyrics
artists_idgenresZ	lyric_idsr$   rF   r)   _convert_token_to_id   s    &z%JukeboxTokenizer._convert_token_to_idc                 C   s   t |S )aQ  
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
        )listrB   lyricsr$   r$   r)   	_tokenize   s    zJukeboxTokenizer._tokenizec                 K   s(   |  |||\}}}| |}|||fS )zV
        Converts three strings in a 3 sequence of tokens using the tokenizer
        )prepare_for_tokenizationrZ   )rB   rL   rO   rY   rC   r$   r$   r)   tokenize   s    
zJukeboxTokenizer.tokenizeF)artistsrU   rY   is_split_into_wordsreturnc                    s8  t t jD ]l} j| dkrD||  ||< ||  g||< q || d ||<  fdd|| dD ||< q jd dkrtd _d	fd
dt tD  _	d j	d< td  _
 j	 _dd  j	 D  _d jd< ntd _ |}|dd} jd|g g f}|||fS )a  
        Performs any necessary transformations before tokenization.

        Args:
            artist (`str`):
                The artist name to prepare. This will mostly lower the string
            genres (`str`):
                The genre name to prepare. This will mostly lower the string.
            lyrics (`str`):
                The lyrics to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
        r   .v2c                    s   g | ]}  |d  qS )r`   )
_normalizerN   rF   r$   r)   rM      s    z=JukeboxTokenizer.prepare_for_tokenization.<locals>.<listcomp>_r   r   r"   zOABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'"()[] 	
c                    s   i | ]} | |d  qS )   r$   )r&   index)vocabr$   r)   r*      s      z=JukeboxTokenizer.prepare_for_tokenization.<locals>.<dictcomp>z<unk>rc   c                 S   s   i | ]\}}||qS r$   r$   r%   r$   r$   r)   r*      s       z$[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+\
)rS   r7   r-   lowerra   splitr9   r:   r;   re   Zn_vocabr6   r<   r?   _run_strip_accentsr8   sub)rB   r]   rU   rY   r^   idxr$   )rB   re   r)   r[      s,    


z)JukeboxTokenizer.prepare_for_tokenizationc                 C   sB   t d|}g }|D ]"}t |}|dkr,q|| qd|S )z$Strips accents from a piece of text.ZNFDZMnrf   )unicodedata	normalizecategoryappendjoin)rB   textoutputcharcatr$   r$   r)   rk      s    
z#JukeboxTokenizer._run_strip_accents)rs   r_   c                    s   dd t tdtdd D dd t tdtdd D  d	d t td
tdd D  dg  t  td}d fdd| D }|d|d}|S )z
        Normalizes the input text. This process is for the genres and the artist

        Args:
            text (`str`):
                Artist or Genre string to normalize
        c                 S   s   g | ]}t |qS r$   chrr&   ir$   r$   r)   rM     s     z/JukeboxTokenizer._normalize.<locals>.<listcomp>azrc   c                 S   s   g | ]}t |qS r$   rw   ry   r$   r$   r)   rM     s     AZc                 S   s   g | ]}t |qS r$   rw   ry   r$   r$   r)   rM     s     09.z_+rf   c                    s   g | ]}| kr|nd qS )rb   r$   )r&   cacceptedr$   r)   rM     s     rb   )	rS   ord	frozensetrer:   rr   ri   rl   strip)rB   rs   patternr$   r   r)   ra     s    

zJukeboxTokenizer._normalize)rY   r_   c                 C   s
   d |S )N )rr   rX   r$   r$   r)   convert_lyric_tokens_to_string  s    z/JukeboxTokenizer.convert_lyric_tokens_to_stringN)tensor_typeprepend_batch_axisc           	      C   s   t |tst|}|tjkr@t s*tdddl}|j}|j}nh|tjkrnt	 sXtdddl
}|j}|j}n:|tjkrt stdddlm} |j}t}n
tj}t}z|r|g}||s||}W n   tdY nX |S )a  
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                unset, no modification is done.
            prepend_batch_axis (`int`, *optional*, defaults to `False`):
                Whether or not to add the batch dimension during the conversion.
        zSUnable to convert output to TensorFlow tensors format, TensorFlow is not installed.r   NzMUnable to convert output to PyTorch tensors format, PyTorch is not installed.zEUnable to convert output to JAX tensors format, JAX is not installed.zUnable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.)r/   r   Z
TENSORFLOWr   ImportErrorZ
tensorflowZconstant	is_tensorZPYTORCHr   torchZtensorZJAXr   Z	jax.numpynumpyarrayr   npZasarrayr   
ValueError)	rB   inputsr   r   tfZ	as_tensorr   r   Zjnpr$   r$   r)   convert_to_tensors  sD    




z#JukeboxTokenizer.convert_to_tensorsrf   pt)r_   c           	         s   dddg|gt j }|gt j }|||\}}}|||\ t gt d  } fddtt jD t|dS )a\  Convert the raw string to a list of token ids

        Args:
            artist (`str`):
                Name of the artist.
            genres (`str`):
                List of genres that will be mixed to condition the audio
            lyrics (`str`, *optional*, defaults to `""`):
                Lyrics used to condition the generation
        r   rP   c                    s6   g | ].}j  | g |  |  gd qS ))r   )r   ry   rT   Zfull_tokensZ
genres_idsr   return_tensorsrB   r$   r)   rM   l  s
    z-JukeboxTokenizer.__call__.<locals>.<listcomp>)r   attention_masks)r7   r-   r\   rV   r   rS   r   )	rB   rL   rU   rY   r   Zartists_tokensZgenres_tokensZlyrics_tokensr   r$   r   r)   __call__Y  s    
zJukeboxTokenizer.__call__)save_directoryfilename_prefixr_   c              	   C   s"  t j|s"td| d dS t j||r6|d ndtd  }t|ddd	}|t	j
| jd
d W 5 Q R X t j||r|d ndtd  }t|ddd	}|t	j
| jd
d W 5 Q R X t j||r|d ndtd  }t|ddd	}|t	j
| jd
d W 5 Q R X |||fS )a  
        Saves the tokenizer's vocabulary dictionary to the provided save_directory.

        Args:
            save_directory (`str`):
                A path to the directory where to saved. It will be created if it doesn't exist.

            filename_prefix (`Optional[str]`, *optional*):
                A prefix to add to the names of the files saved by the tokenizer.

        zVocabulary path (z) should be a directoryN-rf   r   wr   r    F)ensure_asciir   r   )ospathisdirloggererrorrr   VOCAB_FILES_NAMESr1   writer2   dumpsr4   r5   r6   )rB   r   r   r   fr   r   r$   r$   r)   save_vocabularyt  s,          z JukeboxTokenizer.save_vocabularyc                    s:    j |} fdd|D } fdd|D }|||fS )a  
        Converts an index (integer) in a token (str) using the vocab.

        Args:
            artists_index (`int`):
                Index of the artist in its corresponding dictionary.
            genres_index (`Union[List[int], int]`):
               Index of the genre in its corresponding dictionary.
            lyric_index (`List[int]`):
                List of character indices, which each correspond to a character.
        c                    s   g | ]} j |qS r$   )r>   rK   rN   rF   r$   r)   rM     s     z9JukeboxTokenizer._convert_id_to_token.<locals>.<listcomp>c                    s   g | ]} j |qS r$   )r?   rK   rQ   rF   r$   r)   rM     s     )r=   rK   )rB   Zartists_indexZgenres_indexZlyric_indexrL   rU   rY   r$   rF   r)   _convert_id_to_token  s    z%JukeboxTokenizer._convert_id_to_token)F)NF)rf   r   )N)%__name__
__module____qualname____doc__r   Zvocab_files_namesPRETRAINED_VOCAB_FILES_MAPZpretrained_vocab_files_mapPRETRAINED_LYRIC_TOKENS_SIZESZmax_lyric_input_sizeZmodel_input_namesrA   propertyrG   rI   rV   rZ   r\   r0   boolr   r   r   r[   rk   ra   r   r   r   r   r   r   r   r   r   r   __classcell__r$   r$   rD   r)   r   ;   sH   6+

	    -    =$r   )&r   r2   r   r   rn   Zjson.encoderr   typingr   r   r   r   r   r   r   r   r9   Ztokenization_utilsr
   r   Ztokenization_utils_baser   utilsr   r   r   r   r   Zutils.genericr   r   Z
get_loggerr   r   r   r   r   r   r$   r$   r$   r)   <module>   s>    
    