""" Auto Tokenizer class."""
import importlib
import json
import os
import warnings
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...utils import (
    cached_file,
    extract_commit_hash,
    is_sentencepiece_available,
    is_tokenizers_available,
    logging,
)
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)


if is_tokenizers_available():
    from ...tokenization_utils_fast import PreTrainedTokenizerFast
else:
    PreTrainedTokenizerFast = None


logger = logging.get_logger(__name__)

if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            ("albert", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("barthez", ("BarthezTokenizer" if is_sentencepiece_available() else None, "BarthezTokenizerFast" if is_tokenizers_available() else None)),
            ("bartpho", ("BartphoTokenizer", None)),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            ("big_bird", ("BigBirdTokenizer" if is_sentencepiece_available() else None, "BigBirdTokenizerFast" if is_tokenizers_available() else None)),
            ("bigbird_pegasus", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("biogpt", ("BioGptTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
            ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("byt5", ("ByT5Tokenizer", None)),
            ("camembert", ("CamembertTokenizer" if is_sentencepiece_available() else None, "CamembertTokenizerFast" if is_tokenizers_available() else None)),
            ("canine", ("CanineTokenizer", None)),
            ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("clap", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("clip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("clipseg", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("code_llama", ("CodeLlamaTokenizer" if is_sentencepiece_available() else None, "CodeLlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            ("cpm", ("CpmTokenizer" if is_sentencepiece_available() else None, "CpmTokenizerFast" if is_tokenizers_available() else None)),
            ("cpmant", ("CpmAntTokenizer", None)),
            ("ctrl", ("CTRLTokenizer", None)),
            ("data2vec-audio", ("Wav2Vec2CTCTokenizer", None)),
            ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, "DebertaV2TokenizerFast" if is_tokenizers_available() else None)),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            ("dpr", ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None)),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
            ("esm", ("EsmTokenizer", None)),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("fnet", ("FNetTokenizer" if is_sentencepiece_available() else None, "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
            ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
            ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("jukebox", ("JukeboxTokenizer", None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer" if is_sentencepiece_available() else None, "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("llama", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            ("longt5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("luke", ("LukeTokenizer", None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            ("mbart", ("MBartTokenizer" if is_sentencepiece_available() else None, "MBartTokenizerFast" if is_tokenizers_available() else None)),
            ("mbart50", ("MBart50Tokenizer" if is_sentencepiece_available() else None, "MBart50TokenizerFast" if is_tokenizers_available() else None)),
            ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("mgp-str", ("MgpstrTokenizer", None)),
            ("mistral", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("mt5", ("MT5Tokenizer" if is_sentencepiece_available() else None, "MT5TokenizerFast" if is_tokenizers_available() else None)),
            ("musicgen", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
            ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("nllb", ("NllbTokenizer" if is_sentencepiece_available() else None, "NllbTokenizerFast" if is_tokenizers_available() else None)),
            ("nllb-moe", ("NllbTokenizer" if is_sentencepiece_available() else None, "NllbTokenizerFast" if is_tokenizers_available() else None)),
            ("nystromformer", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("pegasus", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("pegasus_x", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("perceiver", ("PerceiverTokenizer", None)),
            ("persimmon", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("phobert", ("PhobertTokenizer", None)),
            ("pix2struct", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("rag", ("RagTokenizer", None)),
            ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
            ("reformer", ("ReformerTokenizer" if is_sentencepiece_available() else None, "ReformerTokenizerFast" if is_tokenizers_available() else None)),
            ("rembert", ("RemBertTokenizer" if is_sentencepiece_available() else None, "RemBertTokenizerFast" if is_tokenizers_available() else None)),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta-prelayernorm", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("roc_bert", ("RoCBertTokenizer", None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            ("squeezebert", ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None)),
            ("switch_transformers", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("t5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            ("umt5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("vits", ("VitsTokenizer", None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
            ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("xglm", ("XGLMTokenizer" if is_sentencepiece_available() else None, "XGLMTokenizerFast" if is_tokenizers_available() else None)),
            ("xlm", ("XLMTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            ("xlm-roberta", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("xlm-roberta-xl", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("xlnet", ("XLNetTokenizer" if is_sentencepiece_available() else None, "XLNetTokenizerFast" if is_tokenizers_available() else None)),
            ("xmod", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("yoso", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
        ]
    )

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
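
# Note (added for clarity; not part of the upstream source): each value above is a
# (slow_tokenizer_class_name, fast_tokenizer_class_name) pair. For example,
# TOKENIZER_MAPPING_NAMES["bert"] resolves to ("BertTokenizer", "BertTokenizerFast")
# when the `tokenizers` library is installed, and to ("BertTokenizer", None) otherwise.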

def tokenizer_class_from_name(class_name: str):
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)

            module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                continue

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    # We did not find the class, but maybe it's because a dependency is missing. In that case, the class will be in
    # the main init and we return it so that the proper import error is raised.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)

    return None

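
# Illustrative usage (added for clarity; not part of the upstream source):
#
#     tokenizer_cls = tokenizer_class_from_name("BertTokenizerFast")
#     # -> the transformers.BertTokenizerFast class (requires the `tokenizers`
#     #    library), or None when the class name is unknown.
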
def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
    """
    Loads the tokenizer configuration for a pretrained model from its tokenizer configuration file.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download of the configuration files, overriding the cached versions if
            they exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete an incompletely received file. Attempts to resume the download if such a file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    use_auth_token = kwargs.pop("use_auth_token", None)
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token

    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)

    with open(resolved_config_file, encoding="utf-8") as reader:
        result = json.load(reader)
    result["_commit_hash"] = commit_hash
    return result


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
        ```"""
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", None)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast:
                if tokenizer_fast_class_name is not None:
                    tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
                else:
                    logger.warning(
                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
                        "Falling back to the slow version."
                    )
            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        has_remote_code = tokenizer_auto_map is not None
        has_local_code = config_tokenizer_class is not None or type(config) in TOKENIZER_MAPPING
        trust_remote_code = resolve_trust_remote_code(
            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
        )

        if has_remote_code and trust_remote_code:
            if use_fast and tokenizer_auto_map[1] is not None:
                class_ref = tokenizer_auto_map[1]
            else:
                class_ref = tokenizer_auto_map[0]
            tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            _ = kwargs.pop("code_revision", None)
            if os.path.isdir(pretrained_model_name_or_path):
                tokenizer_class.register_for_auto_class()
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif config_tokenizer_class is not None:
            tokenizer_class = None
            if use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # If the model is an encoder-decoder, the encoder tokenizer class is used by default.
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    @staticmethod
    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False):
        """
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
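
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, PretrainedConfig, PreTrainedTokenizer

        >>> # A minimal sketch: `CustomConfig` and `CustomTokenizer` are hypothetical
        >>> # placeholder classes, not classes shipped with the library.
        >>> class CustomConfig(PretrainedConfig):
        ...     model_type = "custom"

        >>> class CustomTokenizer(PreTrainedTokenizer):
        ...     pass

        >>> AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        >>> # After registration, `AutoTokenizer.from_pretrained` can resolve checkpoints whose config is a CustomConfig.
        ```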
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`.")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}). Fix one of "
                "those so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok)