"""
The Trainer class, to easily train a 🤗 Transformers model from scratch or fine-tune it on a new task.
    N)Mapping)Path)TYPE_CHECKINGAnyCallableDictListOptionalTupleUnion   )#get_reporting_integration_callbacks	hp_paramsis_fairscale_available)
Repositorycreate_repoupload_folder)version)nn)
DataLoaderDatasetRandomSamplerSequentialSampler)__version__)PretrainedConfig)DataCollatorDataCollatorWithPaddingdefault_data_collator)DebugOptionDebugUnderflowOverflow)dep_version_check)"ALL_HYPERPARAMETER_SEARCH_BACKENDSdefault_hp_search_backend)deepspeed_initdeepspeed_load_checkpointis_deepspeed_available)TrainingSummary)PreTrainedModelload_sharded_checkpointunwrap_model)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMESMODEL_MAPPING_NAMES)	Adafactorget_scheduler)ALL_LAYERNORM_LAYERSis_torch_less_than_1_11)PreTrainedTokenizerBase)CallbackHandlerDefaultFlowCallbackPrinterCallbackProgressCallbackTrainerCallbackTrainerControlTrainerState)DistributedTensorGathererIterableDatasetShardLabelSmootherLengthGroupedSamplerSequentialDistributedSamplerdistributed_broadcast_scalarsdistributed_concatfind_batch_sizeget_dataloader_samplerget_model_param_countget_module_class_from_nameget_parameter_namesnested_concatnested_detachnested_numpifynested_xla_mesh_reducereissue_pt_warningsremove_dummy_checkpoint)PREFIX_CHECKPOINT_DIRBestRunEvalLoopOutputEvalPrediction
FSDPOptionHPSearchBackendHubStrategyIntervalStrategyPredictionOutputRemoveColumnsCollatorShardedDDPOptionTrainerMemoryTrackerTrainOutputdefault_compute_objectivedenumpify_detensorizeenable_full_determinismfind_executable_batch_sizeget_last_checkpoint
has_lengthnumber_of_argumentsseed_workerset_seedspeed_metrics)OptimizerNamesParallelModeTrainingArguments)ADAPTER_CONFIG_NAMEADAPTER_SAFE_WEIGHTS_NAMEADAPTER_WEIGHTS_NAMECONFIG_NAMESAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMEWEIGHTS_INDEX_NAMEWEIGHTS_NAMEPushInProgresscan_return_lossfind_labelsis_accelerate_availableis_apex_availableis_bitsandbytes_availableis_datasets_availableis_in_notebookis_ipex_availableis_peft_availableis_safetensors_availableis_sagemaker_dp_enabledis_sagemaker_mp_enabledis_torch_compile_availableis_torch_neuroncore_availableis_torch_tpu_availablelogging	strtobool)QuantizationMethod)NotebookProgressCallback)ampF)Zcheck_device	fairscale)FullyShardedDataParallel)ShardedDataParallel)	auto_wrap)OSS)ShardedGradScalerz1.10)smp_forward_backwardsmp_forward_only
smp_gathersmp_nested_concat)	PeftModel)Acceleratorskip_first_batches)DistributedDataParallelKwargsGradientAccumulationPlugin0.20.3)load_fsdp_modelload_fsdp_optimizersave_fsdp_modelsave_fsdp_optimizer)DeepSpeedSchedulerWrapperztraining_args.binztrainer_state.jsonzoptimizer.ptzoptimizer.binzscheduler.ptz	scaler.ptc                   @   s  e Zd ZdZddlmZmZmZmZm	Z	 de
eejf eee ee ee
eeeef f  ee eeg ef  eeegef  eee  eejjejjjf eeejejgejf  dddZ d	d
 Z!dd Z"dd Z#dd Z$dd Z%ddee dddZ&deee edddZ'eej(j)j* dddZ+e,dddZ-eeej(j)j* dd d!Z.dee e,dd"d#Z/ee,d$d%d&Z0e1d'd(d)Z2ee dd*d+Z3d,d- Z4e5eee6e6f d.d/d0Z7de1ejjd1d2d3Z8e,e1d4d5d6Z9de,ee1 e1d7d8d9Z:e
d:eee6f f d;d<d=Z;e
d:eee6f f e1eee<f d>d?d@Z=dAdB Z>ddCdDZ?ddFdGZ@dEejAfdHdIZBddKdLZCdee
eeDf  e
d:eee6f f eee  dMdNdOZEddPdQZFdRdS ZGddTdUZHdVdW ZIdXdY ZJdZd[ ZKd\d] ZLdd^d_ZMd`da ZNdeed:geee<f f  eeeee<f ge<f  e1e
eee f ee
ddeOf  eed:gef  e
ePeeP f dedfdgZQeee<f ddhdidjZRe
eje6f e
eje6f dkdldmZSeee
eje6f f eee
eje6f f dndodpZTdqdr ZUdeeD dsdtduZVejeee
eje6f f ejdvdwdxZWddydzZXeDdd{d|ZYeDdd}d~ZZdee eDdddZ[dee dddZ\dee dddZ]dd Z^de_dEfee dddZ`dddddZadee eee  eeee<f dddZbdeeee  eecdddZdde,eeeD eee  eeedddZfdddZgdejeee
eje6f f eDeee  eeej eej eej f dddZheee
eje6f f dddZidd ZjdeDdddZkdee ee e
eee df ee ee e
eee df e
eee df e
eee df e
eee df d	ddZldd Zmdd Zndee eDedddZode,eeeD eee  eeedddZpdd ZqddddZrdd ZsdS )TraineruP  
    Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.

    Args:
        model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
            The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.

            <Tip>

            [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use
            your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers
            models.

            </Tip>

        args ([`TrainingArguments`], *optional*):
            The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the
            `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
        data_collator (`DataCollator`, *optional*):
            The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
            default to [`default_data_collator`] if no `tokenizer` is provided, an instance of
            [`DataCollatorWithPadding`] otherwise.
        train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
            The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed.

            Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
            distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
            `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
            manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
            sets the seed of the RNGs used.
        eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`]], *optional*):
             The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
             `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
             dataset prepending the dictionary key to the metric name.
        tokenizer ([`PreTrainedTokenizerBase`], *optional*):
            The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
            maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
            interrupted training or reuse the fine-tuned model.
        model_init (`Callable[[], PreTrainedModel]`, *optional*):
            A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
            from a new instance of the model as given by this function.

            The function may have zero arguments, or a single one containing the optuna/Ray Tune/SigOpt trial object, to
            be able to choose different architectures according to hyper parameters (such as layer count, sizes of
            inner layers, dropout probabilities etc).
        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
            The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
            a dictionary string to metric values.
        callbacks (List of [`TrainerCallback`], *optional*):
            A list of callbacks to customize the training loop. Will add those to the list of default callbacks
            detailed in [here](callback).

            If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple
            containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model
            and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
            A function that preprocesses the logits right before caching them at each evaluation step. Must take two
            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
            by this function will be reflected in the predictions received by `compute_metrics`.

            Note that the labels (second parameter) will be `None` if the dataset does not have them.

    Important attributes:

        - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`]
          subclass.
        - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
          original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
          the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner
          model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
        - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
          data parallelism, this means some of the model layers are split on different GPUs).
        - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set
          to `False` if model parallel or deepspeed is used, or if the default
          `TrainingArguments.place_model_on_device` is overridden to return `False`.
        - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
          in `train`)
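
    Example:

    A minimal sketch of typical usage. The checkpoint name, the `train_dataset`/`eval_dataset` objects and the
    hyper-parameter values below are placeholders for whatever your project actually uses:

    ```python
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    args = TrainingArguments(output_dir="tmp_trainer", num_train_epochs=3, per_device_train_batch_size=16)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,  # a (tokenized) map-style or iterable dataset
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()
    metrics = trainer.evaluate()
    ```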

    r   )_get_learning_ratelog_metricsmetrics_formatsave_metrics
save_stateNNN)modelargsdata_collatortrain_dataseteval_dataset	tokenizer
model_initcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricsc                 C   s	  |d kr(d}t d| d t|d}|| _| jjrBt| jjn
t| jj d | _d | _	d| _
|   t| jj| _| j  | }t| |j |d kr|d k	r|| _|  }qtdn|d k	rtdt || _|jjtkrtd|jj d	t|d
r |jr |j r d| _!nd| _!t"|dd d k	rdd t#|j$% D }t&|dkrfd| _!n.t&|dkr| jj't('|d k| _!nd| _!| j!rt d t) ot*|t+}t"|ddot"|dd }|r|stdn|rt"|ddstdd | _,t&|j,dkr| j-r,tdt&|j.dkrDtd|j/t0j1kr\tdnt2 snt3dnrt4j5|j,krt6d krt3dt7j8 dnFt4j5|j,krt4j5| _,n.t4j9|j,krt4j9| _,nt4j:|j,krt4j:| _,d | _.t&|j.dkr| j-rtd|j;d s(|j/t0j1kr(tdt<=t<=t(j8j>t<=d k rPtd!dd"l?m@}mA} tBjC|j.krx|jC| _.n.tBjD|j.kr|jD| _.ntBjE|j.kr|jE| _.|jF| _Gd#| jj;krd$| jj;Hd#g kr|jI| _Gd| _J| jj;Hd%drd| _J|jK| _K| j!sP| j-sP|jLs |jMr(|jNrP| j,t4j9t4j:fksP| j.d k	sP| jOrVd| _K|d krdtPntQ|}|d k	rz|n|| _R|| _S|| _T|| _U| jKrt"|d&d tVjWks| X||j' | j!rd| j_Y|| _Z|| _[|| _\|| _]|
\| _^| __|d k	r| j^d k	s| j_d k	rtd't` r| j^d k	r| j[a D ]}|j'} qLq8| j^jbD ],}t&|d( dkrT|d( d j'} qqT||krtd)| j,d k	s| j-s| j.d k	r| j^d k	s| j_d k	rtd*tctd| jje }|	d kr|n||	 }	tf|	| j[| jU| j^| j_| _g| h| jjir&tjntk d| _ld | _m| jjnrJ| o  | jjprftqjr| jjsdd+ tt| jRsttt"| jRd,d rtd-|judkrt d. |d k	rtv|s|judkrtd/|d k	rt*|t(jwjxjyr|jzrtd0d | _{d| _|d| _}d| _~t r|jr(td1trx|jtjjjkrt d2tjjj d3|j d4tjjj  tjjj|_n(ttjjd5rt d2tjjj d6 |js|jr| j,d k	r|jd7kr|j't('d8kr|jrtd9nd:|_nd;|_t d<|j d= d| _|js$|jr| j-st s| j,d k	r|jd;krd| _}|jr`t(jnt(j| _| jt(jk| _| jrt | _n|jd:krd| _~t(j| _n"|jd>krt st3d?d| _|t r| j}r|jd k	r|jdkrtd@| jjdk	rt| jjdA| _nd | _t|  |  dB| _t | _d| _d | _d| _t| j[j}| jjd k	rj|n| jj| _t| j[j| _| jg| j| j| j| _|j| _d| _| j  |j	rt 	stdCd S )DNZtmp_trainerz1No `TrainingArguments` passed, using `output_dir=z`.
output_dirFz<`Trainer` requires either a `model` or `model_init` argumentz`Trainer` requires either a `model` or `model_init` argument, but not both. `model_init` will overwrite your model when calling the `train` method. This will become a fatal error in the next release.zThe model you have picked (a  ) cannot be used as is for training: it only computes hidden states and does not accept any labels. You should choose a model with a head suitable for your task like any of the `AutoModelForXxx` listed at https://huggingface.co/docs/transformers/model_doc/autois_parallelizableThf_device_mapc                 S   s   g | ]}|d kr|qS ))cpuZdisk ).0devicer   r   U/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/trainer.py
<listcomp>  s      z$Trainer.__init__.<locals>.<listcomp>r   r   zYou have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.Zis_quantizedZ_hf_peft_config_loadedzYou cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more detailsZ_is_quantized_training_enabledzThe model you want to train is loaded in 8-bit precision.  if you want to fine-tune an 8-bit model, please make sure that you have installed `bitsandbytes>=0.37.0`. zaUsing --sharded_ddp xxx together with --deepspeed is not possible, deactivate one of those flags.z\Using --sharded_ddp xxx together with --fsdp is not possible, deactivate one of those flags.z5Using sharded DDP only works in distributed training.zASharded DDP training requires fairscale: `pip install fairscale`.zZSharded DDP in a mode other than simple training requires fairscale version >= 0.3, found zD. Upgrade your fairscale library: `pip install --upgrade fairscale`.zZUsing --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags.xlaz.Using fsdp only works in distributed training.z1.12.0zFSDP requires PyTorch >= 1.12.0)BackwardPrefetchShardingStrategybackward_prefetchZbackward_postlimit_all_gathersZquantization_methodzPassing a `model_init` is incompatible with providing the `optimizers` argument. You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method.paramsa[  The model and the optimizer parameters are not on the same device, which probably means you created an optimizer around your model **before** putting on the device and passing it to the `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and `model.to(xm.xla_device())` is performed before the optimizer creation in your script.zPassing `optimizers` is not allowed if Fairscale, Deepspeed or PyTorch FSDP is enabled.You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method.exist_okZcollate_batchzRThe `data_collator` should be a simple callable (function, class with `__call__`).zHmax_steps is given, it will override any value given in num_train_epochszThe train_dataset does not implement __len__, max_steps has to be specified. The number of steps needs to be known in advance for the learning rate scheduler.zTthe `--group_by_length` option is only available for `Dataset`, not `IterableDatasetzOSageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead z(FP16 provided in SM_HP_MP_PARAMETERS is z*,but FP16 provided in trainer argument is z,setting to fp16zJ, but SageMaker Model Parallelism < 1.10 does not support FP16 in trainer.autor   z2Tried to use `fp16` but it is not supported on cpuZcpu_ampZcuda_ampzUsing z half precision backendapexzcUsing FP16 with APEX but APEX is not installed, please refer to https://www.github.com/nvidia/apex.zSageMaker Model Parallelism in mixed precision mode does not support gradient clipping yet. 
Pass along 'max_grad_norm': 0 in your hyperparameters.)epsilon)is_local_process_zerois_world_process_zeroz3Using torch.compile requires PyTorch 2.0 or higher.)loggerinforc   r   full_determinismrY   seedr_   hp_name	deepspeedis_in_train"create_accelerator_and_postprocessrU   Zskip_memory_metrics_memory_trackerstartZget_process_log_levelr|   set_verbosityZ_setup_devicesr   call_model_initRuntimeErrorwarningswarnFutureWarning	__class____name__r+   
ValueErrorhasattrr   Zmodel_parallelZis_model_parallelgetattrsetr   valueslenr   torchru   
isinstancer   sharded_ddpis_deepspeed_enabledfsdpparallel_moderb   DISTRIBUTEDr   ImportErrorrT   SIMPLEFullyShardedDDPr   r   	ZERO_DP_2	ZERO_DP_3fsdp_configr   parsebase_versionZ2torch.distributed.fsdp.fully_sharded_data_parallelr   r   rN   Z
FULL_SHARDZSHARD_GRAD_OPZNO_SHARDZBACKWARD_PREr   getZBACKWARD_POSTr   place_model_on_devicefp16_full_evalbf16_full_evaldo_trainis_fsdp_enabledr   r   r   r   r   r   r~   ZBITS_AND_BYTES_move_model_to_deviceZ_n_gpumodel_wrappedr   r   r   	optimizerlr_schedulerr{   
parametersZparam_groupsDEFAULT_CALLBACKSr   Z	report_tor1   callback_handleradd_callbackZdisable_tqdmr3   DEFAULT_PROGRESS_CALLBACKZ_loggers_initializedhub_model_idpush_to_hubinit_hf_reposhould_saveosmakedirsr   callable	max_stepsr\   utilsdataIterableDatasetgroup_by_length_signature_columnsuse_apexuse_cuda_ampuse_cpu_amprx   bf16IS_SAGEMAKER_MP_POST_1_10r   smpstatecfgwarningZhalf_precision_backenddo_grad_scalingfloat16bfloat16	amp_dtyper   scalerrp   max_grad_normZlabel_smoothing_factorr:   label_smootherr7   r   r   r6   controlcurrent_floshp_search_backenduse_tune_checkpointsrn   label_namesrm   Zon_init_endtrain_batch_size_train_batch_size_created_lr_schedulerstop_and_update_metricsZtorch_compilery   )selfr   r   r   r   r   r   r   r   r   r   r   r   	log_levelZdevicesZ_is_peft_modelZ_is_quantized_and_base_modelr   r   Zdefault_collatorparamZmodel_deviceZparam_groupZoptimizer_deviceZdefault_callbacksZdefault_label_namesr   r   r   __init__B  s    
 



  



 

 
"


     

 
 "
 


zTrainer.__init__c                 C   s   | j | dS )ac  
        Add a callback to the current list of [`~transformers.TrainerCallback`].

        Args:
           callback (`type` or [`~transformers.TrainerCallback`]):
               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
               first case, will instantiate a member of that class.
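
        Example:

        A small sketch of registering a custom callback; `PrintLossCallback` and the `trainer` instance are
        assumptions made for illustration:

        ```python
        from transformers import TrainerCallback


        class PrintLossCallback(TrainerCallback):
            def on_log(self, args, state, control, logs=None, **kwargs):
                # `logs` is the metrics dictionary the Trainer just reported.
                if logs is not None and "loss" in logs:
                    print(f"step {state.global_step}: loss={logs['loss']:.4f}")


        trainer.add_callback(PrintLossCallback)  # pass the class itself, or an instance: PrintLossCallback()
        ```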
        N)r   r   r  callbackr   r   r   r     s    	zTrainer.add_callbackc                 C   s   | j |S )aF  
        Remove a callback from the current list of [`~transformers.TrainerCallback`] and return it.

        If the callback is not found, returns `None` (and no error is raised).

        Args:
           callback (`type` or [`~transformers.TrainerCallback`]):
               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
               first case, will pop the first member of that class found in the list of callbacks.

        Returns:
            [`~transformers.TrainerCallback`]: The callback removed, if found.
        )r   pop_callbackr   r   r   r   r"    s    zTrainer.pop_callbackc                 C   s   | j | dS )a  
        Remove a callback from the current list of [`~transformers.TrainerCallback`].

        Args:
           callback (`type` or [`~transformers.TrainerCallback`]):
               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
               first case, will remove the first member of that class found in the list of callbacks.
        N)r   remove_callbackr   r   r   r   r#    s    	zTrainer.remove_callbackc                 C   s.   | |}| jjtjkr*t|dr*|  d S )Ntie_weights)tor   r   rb   ZTPUr   r$  )r  r   r   r   r   r   r     s    
zTrainer._move_model_to_devicec                 C   sL   | j d krHt| jj}t|j | _ |  j ttddg| j	 7  _ d S )Nlabel	label_ids)
r  inspect	signaturer   forwardlistr   keysr   r  )r  r)  r   r   r    _set_signature_columns_if_needed  s    
z(Trainer._set_signature_columns_if_neededzdatasets.Dataset)datasetdescriptionc                    s   | j js S |   | j}tt jt| }t|dkr|d krHdn
d| d}t	d| d| j
jj dd| d	d| d
| j
jj d  fdd|D }ttjtdk rވ j jd | jd d  S  |S d S )Nr    zin the z setzThe following columns z) don't have a corresponding argument in `z!.forward` and have been ignored: , z. If z are not expected by `z/.forward`,  you can safely ignore this message.c                    s   g | ]}| j kr|qS r   )column_namesr   kr.  r   r   r      s     
 z2Trainer._remove_unused_columns.<locals>.<listcomp>z1.4.0typeformat_kwargs)r6  columnsr7  )r   remove_unused_columnsr-  r  r+  r   r2  r   r   r   r   r   r   joinr   r   datasetsr   Z
set_formatformatZremove_columns)r  r.  r/  signature_columnsZignored_columnsZdset_descriptionr8  r   r5  r   _remove_unused_columns  s&    :  zTrainer._remove_unused_columns)r   r/  returnc                 C   s6   | j js|S |   | j}t||t|| jjjd}|S )z=Wrap the data collator in a callable removing unused columns.)r   r=  r   r/  
model_name)	r   r9  r-  r  rS   r   r   r   r   )r  r   r/  r=  Zremove_columns_collatorr   r   r   "_get_collator_with_removed_columns
  s    z*Trainer._get_collator_with_removed_columns)r?  c                 C   s   | j d kst| j sd S | jjrt rXt| j tjrX| jj| j j	krR| j | jj nd }nd }| j
d k	rr| j
jd nd }t| jj| jj | j ||dS t| j S d S )Nr   )r.  lengthsmodel_input_name)r   r\   r   r  rr   r   r;  r   Zlength_column_namer2  r   Zmodel_input_namesr;   r  gradient_accumulation_stepsr   )r  rB  rC  r   r   r   _get_train_sampler  s"    zTrainer._get_train_samplerc                 C   s   | j dkrtd| j }| j}t r@t|tjr@| j|dd}n| j|dd}| j	|| j
j| j
jd}t|tjjjs|  |d< | j
j|d< t|d< | jt|f|S )	a@  
        Returns the training [`~torch.utils.data.DataLoader`].

        Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
        training if necessary) otherwise.

        Subclass and override this method if you want to inject some custom behavior.
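
        Example:

        A minimal sketch of a subclass that swaps in its own `DataLoader`; the sequential-sampler choice is
        purely illustrative and skips the extra handling (column removal, seeding, distributed preparation)
        that the default implementation performs:

        ```python
        from torch.utils.data import DataLoader, SequentialSampler


        class SequentialTrainer(Trainer):
            def get_train_dataloader(self) -> DataLoader:
                # Iterate over the training set in order instead of shuffling it.
                return DataLoader(
                    self.train_dataset,
                    batch_size=self.args.per_device_train_batch_size,
                    sampler=SequentialSampler(self.train_dataset),
                    collate_fn=self.data_collator,
                )
        ```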
        Nz+Trainer: training requires a train_dataset.trainingr/  
batch_sizeZ
collate_fnZnum_workersZ
pin_memorysampler	drop_lastZworker_init_fn)r   r   r   rr   r   r;  r   r>  rA  r  r   dataloader_num_workersdataloader_pin_memoryr   r   r   r   rE  dataloader_drop_lastr^   acceleratorpreparer   )r  r   r   dataloader_paramsr   r   r   get_train_dataloader5  s"    	
zTrainer.get_train_dataloader)r   r?  c                 C   sj   | j jrNt r$t|t t dS t rFt|t	 t
 | j jdS t|S | j jdkrbt|S d S d S )N)num_replicasrank)rS  rT  rI  r   )r   use_legacy_prediction_loopr{   r<   xmxrt_world_sizeZget_ordinalrx   r  Zdp_sizeZdp_rankZper_device_eval_batch_sizer   
world_size)r  r   r   r   r   _get_eval_samplerV  s$      zTrainer._get_eval_samplerc                 C   s   |dkr| j dkrtd|dk	r&|n| j }| j}t rTt|tjrT| j|dd}n| j|dd}| j	j
|| j	j| j	jd}t|tjjjs| ||d< | j	j|d< | jt|f|S )a  
        Returns the evaluation [`~torch.utils.data.DataLoader`].

        Subclass and override this method if you want to inject some custom behavior.

        Args:
            eval_dataset (`torch.utils.data.Dataset`, *optional*):
                If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
                by the `model.forward()` method are automatically removed. It must implement `__len__`.
        Nz-Trainer: evaluation requires an eval_dataset.Z
evaluationrG  rH  rJ  rK  )r   r   r   rr   r   r;  r   r>  rA  r   eval_batch_sizerL  rM  r   r   r   r   rY  rN  rO  rP  r   )r  r   r   rQ  r   r   r   get_eval_dataloaderl  s     zTrainer.get_eval_dataloader)test_datasetr?  c                 C   s   | j }t r(t|tjr(| j|dd}n| j|dd}| jj|| jj	| jj
d}t|tjjjsz| ||d< | jj|d< | jt|f|S )a  
        Returns the test [`~torch.utils.data.DataLoader`].

        Subclass and override this method if you want to inject some custom behavior.

        Args:
            test_dataset (`torch.utils.data.Dataset`, *optional*):
                The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the
                `model.forward()` method are automatically removed. It must implement `__len__`.
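
        Example:

        This loader is what [`Trainer.predict`] iterates over; a typical call, with `test_dataset` standing in
        for your own tokenized dataset:

        ```python
        outputs = trainer.predict(test_dataset)
        print(outputs.metrics)          # e.g. test_loss, test_runtime, ...
        predictions = outputs.predictions
        ```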
        testrG  rH  rJ  rK  )r   rr   r   r;  r   r>  rA  r   rZ  rL  rM  r   r   r   r   rY  rN  rO  rP  r   )r  r\  r   rQ  r   r   r   get_test_dataloader  s    zTrainer.get_test_dataloadernum_training_stepsc                 C   s8   |    tr tjjjr | jj}n| j}| j||d dS )aZ  
        Setup the optimizer and the learning rate scheduler.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or
        `create_scheduler`) in a subclass.
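
        Example:

        A sketch of supplying your own optimizer and schedule at construction time instead of overriding this
        method; the hyper-parameter values are arbitrary and `model`/`args`/`train_dataset` are assumed to exist:

        ```python
        from torch.optim import AdamW
        from torch.optim.lr_scheduler import LambdaLR

        optimizer = AdamW(model.parameters(), lr=5e-5)
        scheduler = LambdaLR(optimizer, lr_lambda=lambda step: max(0.0, 1.0 - step / 10_000))

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            optimizers=(optimizer, scheduler),
        )
        ```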
        r`  r   N)create_optimizerr  r  r	  r
  r   r   create_schedulerr  r`  r   r   r   r   create_optimizer_and_scheduler  s
    
z&Trainer.create_optimizer_and_schedulerc                 C   s   t |t}dd |D }|S )a!  
        Get all parameter names that weight decay will be applied to.

        Note that some models implement their own layernorm instead of calling nn.LayerNorm; weight decay could still
        apply to those modules since this function only filters out instances of nn.LayerNorm.
        c                 S   s   g | ]}d |kr|qS )Zbiasr   r   namer   r   r   r     s      z5Trainer.get_decay_parameter_names.<locals>.<listcomp>)rC   r.   )r  r   decay_parametersr   r   r   get_decay_parameter_names  s    
z!Trainer.get_decay_parameter_namesc           	         st  t  r| jn| j}| jdkrX| |  fdd| D | jjd fdd| D ddg}t	| j\}}| j
tjkrtf ||d|| _n||f|| _|jdkrXd	dl}|jj }d	}| D ]n}t|tjr|td
d | D  7 }td| d|d  d ||dddi td| d qtd|d  d t  rnt| j| _| jS )a   
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method in a subclass.
        Nc                    s"   g | ]\}}| kr|j r|qS r   Zrequires_gradr   nprh  r   r   r     s      z,Trainer.create_optimizer.<locals>.<listcomp>)r   weight_decayc                    s"   g | ]\}}| kr|j r|qS r   rj  rk  rn  r   r   r     s              )r   optimZAdam8bitr   c                 S   s   i | ]}|  | qS r   )Zdata_ptrnumel)r   rm  r   r   r   
<dictcomp>  s      z,Trainer.create_optimizer.<locals>.<dictcomp>zskipped z: i   zM paramsweight
optim_bits    zbitsandbytes: will optimize z in fp32z	skipped: ) rx   r   r   r   ri  Znamed_parametersr   ro  r   get_optimizer_cls_and_kwargsr   rT   r   r   r   bitsandbytesrq  ZGlobalOptimManagerZget_instancemodulesr   r   Z	Embeddingsumr   r   r   r   Zregister_module_overridedebugr  ZDistributedOptimizer)	r  Z	opt_modelZoptimizer_grouped_parametersoptimizer_clsoptimizer_kwargsrx  managerZskippedmoduler   rn  r   rb    sJ    



zTrainer.create_optimizer)r   r?  c                 C   s  i }| j r:| j dddD ]}|d\}}|||< qd| ji}| j| jf| jd}| jtj	kr|t
}|ddd n| jtjkrd	d
lm} |}|| n| jtjtjfkrdd
lm} |}|| | jtjkr|ddi n| jtjkr@zdd
lm} |}|| W n tk
r:   tdY nX nR| jtjkrzddlm}	 |	}|| W n tk
r   tdY nX n| jtjtjtjtjtjtjtjtj fkrzddl!m}m"}
 d}d}d}|}d| jkrd}d| jkrd}d| jkr|}n d| jkr0|
}d| j| jfi}||d}|| || W n tk
rn   tdY nX t# rt$%t&j'$dt$%dk rt()d n| jtj*kr:zhddl+m,} |}|| |t-|.d d!t/t0|.d"d#t/t0|.d$d#t/t0|.d%d&d' W n tk
r6   td(Y nX nX| jtj1krRt0jj1}n@| jtj2krjt0jj3}n(| jtj4krt0jj5}ntd)| j ||fS )*z
        Returns the optimizer class and optimizer parameters based on the training arguments.

        Args:
            args (`transformers.training_args.TrainingArguments`):
                The training arguments for the training session.

         r0  ,=lr)betasZepsF)Zscale_parameterZrelative_stepr   )AdamWr   ZfusedTz7Trainer failed to import syncfree AdamW from torch_xla.)	FusedAdamzFTrainer tried to instantiate apex FusedAdam but apex is not installed!)r  Lionrv  NZpaged8bit   ZadamZlionr  )is_pagedru  zDTrainer tried to instantiate bnb optimizer but bnb is not installed!rx  z0.41.1zYou are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.)AnyPrecisionAdamWuse_kahan_summationFalsemomentum_dtypefloat32variance_dtypecompensation_buffer_dtyper  )r  r  r  r  z4Please install https://github.com/pytorch/torchdistxz2Trainer cannot instantiate unsupported optimizer: )6
optim_argsreplacesplitlearning_rateZ
adam_beta1Z
adam_beta2Zadam_epsilonrq  ra   Z	ADAFACTORr,   updateZADAMW_HFoptimizationr  ZADAMW_TORCHZADAMW_TORCH_FUSEDZtorch.optimZADAMW_TORCH_XLAZtorch_xla.amp.syncfreer   r   ZADAMW_APEX_FUSEDZapex.optimizersr  Z	ADAMW_BNBZ
ADAMW_8BITZPAGED_ADAMWZPAGED_ADAMW_8BITZLIONZ	LION_8BITZ
PAGED_LIONZPAGED_LION_8BITZbitsandbytes.optimr  rq   r   r   	importlibmetadatar   r  ZADAMW_ANYPRECISIONZtorchdistx.optimizersr  r}   r   r   r   ZSGDZADAGRADZAdagradZRMSPROPZRMSprop)r   r  mappingkeyvaluer}  Zadam_kwargsr|  r  r  r  r  ru  Zadditional_optim_kwargsZ
bnb_kwargsr  r   r   r   rw     s    








 




z$Trainer.get_optimizer_cls_and_kwargsra  c                 C   sB   | j dkr<t| jj|dkr | jn|| j||d| _ d| _| j S )z
        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
        passed as an argument.

        Args:
            num_training_steps (int): The number of training steps to do.
        N)r   Znum_warmup_stepsr`  T)r   r-   r   Zlr_scheduler_typer   Zget_warmup_stepsr  rd  r   r   r   rc  |  s    

zTrainer.create_scheduler)
dataloaderr?  c              
   C   sZ   z*|j }t|tr t|j j W S t|j W S  tttfk
rT   t|| jj  Y S X dS )z
        Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. When
        dataloader.dataset does not exist or has no length, estimates as best it can
        N)	r.  r   r9   r   	NameErrorAttributeError	TypeErrorr   per_device_train_batch_size)r  r  r.  r   r   r   num_examples  s    
zTrainer.num_examples)train_dlr   r?  c                 C   sn   d}zBt |D ]2\}}|d  }|dk	r8||   W S ||7 }q|W S  tk
rh   td | Y S X dS )zq
        Helper to get number of tokens in a [`~torch.utils.data.DataLoader`] by enumerating dataloader.
        r   	input_idsNz%Cannot get num_tokens from dataloader)	enumeraterr  KeyErrorr   r  )r  r  r   Ztrain_tokensstepbatchtokensr   r   r   
num_tokens  s    

zTrainer.num_tokenszoptuna.Trialtrialc                 C   s  || _ | jdks|dkrdS | jtjkr4| |}nP| jtjkrR|}|dd n2| jtjkrtdd |j	 D }n| jtj
kr|}|	 D ]X\}}t| j|std| d qt| j|d}|dk	rt||}t| j|| q| jtjkrtd|j  | jtjkr&td|j  | jtj
krDtd	|  | jr| jjdkrbtd
ddlm} ddlm} || jj| j_| jj| j || jjd| j_|   dS )zHP search setup codeNwandbc                 S   s(   i | ] \}}|t |tr t|n|qS r   )r   strintr   r4  vr   r   r   rs    s      z,Trainer._hp_search_setup.<locals>.<dictcomp>zTrying to set zY in the hyperparameter search but there is no corresponding field in `TrainingArguments`.zTrial: zSigOpt Assignments: zW&B Sweep parameters: z7For sweeps with deepspeed, `args.deepspeed` must be setr   )DeepSpeedPluginHfTrainerDeepSpeedConfig)hf_ds_config)_trialr  rO   OPTUNAhp_spaceRAYpopSIGOPTassignmentsitemsWANDBr   r   r   r  r   r6  setattrr   r   r   r   r   accelerate.utilsr  #transformers.integrations.deepspeedr  hf_deepspeed_configtrainer_config_processdeepspeed_pluginr   )r  r  r   r  r  old_attrr  r  r   r   r   _hp_search_setup  sJ    
zTrainer._hp_search_setup)r  r  metricsc                 C   s   | j d ks|d krd S | | | _| j tjkrzdd l}|j s|	| j| |
 r| j| j| j| j | n>| j tjkrddlm} | jjr|   |j	f d| ji| d S )Nr   tune	objective)r  compute_objectivecopyr  rO   r  optunaZstudyZ_is_multi_objectivereportZshould_pruner   on_train_endr   r	  r  ZTrialPrunedr  rayr  r   _tune_save_checkpoint)r  r  r  r  r  r  r   r   r   _report_to_hp_search  s    

zTrainer._report_to_hp_searchc              	   C   s   ddl m} | jsd S |j| jjd}tj|t	 d| jj }| j
|dd | jjr| jtj|t t| j tj|t t| j tj|t W 5 Q R X d S )Nr   r  )r  -T_internal_call)r  r  r  checkpoint_dirr	  global_stepr   pathr:  rJ   
save_modelr   r   save_to_jsonTRAINER_STATE_NAMEr   saver   
state_dictOPTIMIZER_NAMEr   SCHEDULER_NAME)r  r  r  r   r   r   r   r    s    zTrainer._tune_save_checkpointc                 C   sL   t | j}|dkr|  }n|dkr0| |}ntd|d krHtd|S )Nr   r   z'model_init should have 0 or 1 argument.z"model_init should not return None.)r]   r   r   )r  r  Zmodel_init_argcountr   r   r   r   r     s    

zTrainer.call_model_initFc           
         s  |s|d krt d |S tt| |   z>t|}|  |jdd }|r`||_	| j
jdd t  tttjjtdkrt trtjj| dd}n tjj| fdd D dd}n@g } D ]}t | }|| qt|}tjj||dd	}W 5 Q R X W 5 Q R X tj|}t  |f   |f   W 5 Q R X |}d| _d| _W n@ ttttt fk
r }	 zt d
|	 d W 5 d }	~	X Y nX |S )NzAfailed to use PyTorch jit mode due to current dataloader is none.Z_original_forwardFcache_enabledz2.0.0)Zexample_kwarg_inputsstrictc                    s   i | ]}| | qS r   r   )r   r  Zexample_batchr   r   rs     s      z0Trainer.torch_jit_model_eval.<locals>.<dictcomp>r  z'failed to use PyTorch jit mode due to: .)!r   r  nextiter_prepare_inputsr  eval__dict__r  r*  rO  autocastr   no_gradr   r   r   r   r   dictZjittraceZ	ones_likeappendtuplefreezer  r  r   r  r   r  
IndexError)
r  r   r  rF  Z	jit_modelZoriginal_forwardZ
jit_inputsr  Zexample_tensorer   r  r   torch_jit_model_eval  sJ    



&


$zTrainer.torch_jit_model_evalc                 C   s   t  stddd l}|sT|  | js6| jjr6tjn|}|j	||dd| j d}n*|j
sb|  |j	||| jddd\}| _|S )NzUsing IPEX but IPEX is not installed or IPEX's version does not match current PyTorch, please refer to https://github.com/intel/intel-extension-for-pytorch.r   ZO1F)dtypelevelZconv_bn_foldinginplaceT)r  r   r  r  )rt   r   Zintel_extension_for_pytorchr  r   r   r   r   r  optimizerF  trainr   )r  r   rF  r  Zipexr   r   r   ipex_optimize_model6  s&        zTrainer.ipex_optimize_modelTc                    s  | j jr*| jrtjntj}| j|||d}t rXt| j	t
jjrF| j	S t
j|| j jdS t||k	rh|S | jr|rtj|| j| j jd\}| _| j jdkrt|ddst|}| j jrt }| |||}tt | d| _|s|S | jd k	rz| jtjkrt || j}nd| j j!p"| j j"}tj#| j jk}| jtj$k}tj%| j jkrVt&|}t'||||d(| j j) | _}n"| j*d k	r| j j+d	 rz,d
dl,m-  d
dl,m. d
dl/m0}	m1}
 W n t2k
r   t2dY nX d }d }t|dd }| j j+3d|}| j j+d d
kr.t4j5|	| j j+d d}nR|d k	rt6 }|D ].}t7||}|d krdt8dn
|9| qBt4j5|
|d}| j j:}| j j+d r fdd} |f||d| | _}di fdd}|t;_<nt= rtj>j?|t@tABdgd}n| j jCtDjEkrtF r|S i }| j jGd k	r8| j jG|d< n"t|tHrR|jI |d< nd|d< | j jJd k	rt| j jJ|d< | j jKd k	r| j jK|d < tLf || jM_N|S )!N)r  )Zbackward_passes_per_step)	opt_levelr   Zis_loaded_in_8bitF   )mixed_precisionZreshard_after_forwardcpu_offloadr   r   )XlaFullyShardedDataParallel)checkpoint_module)size_based_auto_wrap_policytransformer_auto_wrap_policyzJMissing XLA FSDP related module; please make sure to use torch-xla >= 2.0.Z_no_split_modulesZtransformer_layer_cls_to_wrapmin_num_params)r  z@Could not find the transformer layer class to wrap in the model.)Ztransformer_layer_clsZxla_fsdp_grad_ckptc                    s    | f||S Nr   )mr   kwargsZFSDPr   r   r   auto_wrapper_callable  s    z2Trainer._wrap_model.<locals>.auto_wrapper_callable)auto_wrap_policyr  c                 S   s   | j f |}|rt  |S r  )r  rV  	mark_step)r   barrierZoptimizer_argslossr   r   r   patched_optimizer_step  s    z3Trainer._wrap_model.<locals>.patched_optimizer_stepZSMDATAPARALLEL_LOCAL_RANK)Z
device_idsZfind_unused_parametersTZbucket_cap_mbZbroadcast_buffers)Or   Zuse_ipexr  r   r  r  r  rx   r   r   r  r   ZDistributedModelrD  r)   r  r   Z
initializer   Zfp16_opt_leveln_gpur   r   ZDataParallelZjit_mode_evaltimer  roundjit_compilation_timer   rT   r   
ShardedDDPr   r  ZOFFLOADr   Z	AUTO_WRAPr   r   r%  r   r   r   Ztorch_xla.distributed.fsdpr  r   Ztorch_xla.distributed.fsdp.wrapr  r  r   r   	functoolspartialr   rB   	ExceptionaddZxla_fsdp_configrV  Zoptimizer_steprw   parallelZDistributedDataParallelr  r   getenvr   rb   r   rz   Zddp_find_unused_parametersr'   Zis_gradient_checkpointingZddp_bucket_cap_mbZddp_broadcast_buffersr   rO  Zddp_handler)r  r   rF  r  r  
start_timer  r  Zzero_3r  r  r	  r  Z%default_transformer_cls_names_to_wrapZ"fsdp_transformer_layer_cls_to_wrapZtransformer_cls_to_wrapZlayer_classZtransformer_clsZfsdp_kwargsr  r  r   r  r   _wrap_modelM  s    

  




	 zTrainer._wrap_model)resume_from_checkpointr  ignore_keys_for_evalc                 K   s  |dkrd}| j   | j}d| _|js.|jrD|jsD| | j|j	 d|krb|
d}tdt t|dkrtddt|  d	| | | jj| _d}| jdk	r| jjrt| jjn
t| jj | || _d}d
\| _| _t|tr"|r"t |j!}|dkr"t"d|j! d|dk	rNt# sN| j$sN| j%sN| &| |rt| j'rl| | j|j	 | j| _(t)| j*| j|j+}|j,rzt-/  |||||dW S t-.  X n|||||dS dS )a  
        Main training entry point.

        Args:
            resume_from_checkpoint (`str` or `bool`, *optional*):
                If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
                `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
                of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
            trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
                The trial run or the hyperparameter dictionary for hyperparameter search.
            ignore_keys_for_eval (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions for evaluation during the training.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments used to hide deprecated arguments.
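
        Example:

        Typical calls, assuming an already-configured `trainer` instance (the checkpoint path is a placeholder):

        ```python
        trainer.train()                                             # start training from the given model
        trainer.train(resume_from_checkpoint=True)                  # resume from the last checkpoint in `args.output_dir`
        trainer.train(resume_from_checkpoint="out/checkpoint-500")  # resume from a specific checkpoint folder
        ```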
        FNTZ
model_pathzi`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` instead.r   z3train() received got unexpected keyword arguments: r1  r  r   z/No valid checkpoint found in output directory ())r   r  r  r  )0r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r  r:  r+  r,  r  r  r  r   r   rY   r   r_   r   r   r   r   boolr[   r   r   rx   r   r   _load_from_checkpointr   r   rZ   _inner_training_loopZauto_find_batch_sizer   hf_hub_utilsZenable_progress_barsZdisable_progress_bars)r  r  r  r  r  r   Zmodel_reloadedZinner_training_loopr   r   r   r    sz    




 


  
zTrainer.trainc           /      C   s  | j   || _td| j  |  }| j|j |j }d }d }	t|rt	|}||j }
t
|
d}
| |}|jdkr|j}|j|
 t|j|
 dk }|j| }|jr| ||j|j }	nDt|j|
 }t|j}| ||j }|jrj| ||j }	n^|jdkrZ|j}tj}|}
||j }|j| }|jrj| ||j|j }	ntd|j tj| jjkr| jjdkrtdn
t| j}| jd k	r| jtjkpt p| j d k	p| j!}| j"rd | _#d| _"| j$r t%| |d\| _&| _#|s| j'|d t( | _)|d k	| j)_*|j+d k	r^|j+dk rTt||j+ | j)_+n
|j+| j)_+|j,d k	r|j,dk rt||j, | j)_,n
|j,| j)_,|j-d k	r|j-dk rt||j- | j)_-n
|j-| j)_-|j.r| j/  | 0| j1}|| jkrdnd}|r(|r| j 2| j| _| j'|d |r| j3  t4| j#d	rx| j5r^| j 2| j}n| j 2| j| j&\}| _&n | j 2| j| j&| j#\}| _&| _#| j!r| | _| _1|| jk	r|| _1| j$r| j1| _6|d k	r| j$rt7| j1| nt s| j!r| 8|| j1 | 9| t:d
 t:d|d t:d|d t:d| jj;d | jj;| jkr~t:d| jd t:d|d t:d|j  t:d|d t:dt<|ddd d| j)_=t>> }d}d}d }|d k	rt?j@At?j@B|tCrt(Dt?j@B|tC| _)| j)jE|
 }|jFsN| j)jE|
 }||j9 }nd}t:d t:d|  t:d| j)jE  |jFst:d| d| d | j| jG_| j&| jG_&| j#| jG_#|| jG_H| jId k	r| jJd k	r| I| jJ| j)_K|d k	r| jLtMjNkr|jOn|}tP|| j)_Qnd | j)_Q|| j)_|| j)_| R | j)_R| S | j)_StTUdV|jW}d| _X| j)jE| _Y|Z  | jG[|| j)| j\| _\|jFst]|D ]R}t^|}t_|t`}tas|s|D ]} qqn|d k	r|ng }tb|}qd}t]||D ]}|}|jcdkrd | _d|d k	r.t	|n
|j|j }| jGe|| j)| j\| _\||krx|d k	rx|dkrx| f| d} d}!|dkrtg||}|}!d}d} d}"th|D ]\}"}#|d7 }| r| f| d} |dkr|d8 }|d k	r|id |dkr| f| qn|d k	r(|j  d }|"|j dkrN| jGk|| j)| j\| _\| j l| | m||#}$W 5 Q R X |jnrto stTp|$stTq|$r||d| j)jE | jY  7 }n||$7 }|  jrts| t|#7  _r||jko|"d |k}%||j dk	s|%r |%	s tuvtwtuvdk	r.| j jxyd |jzd k	
r|jzdk
r| j{	rto 	rzt|}| j&}&t|j~d|&dt|  d  | j| j& t 	r|j	r| j&|jz nnt4| j&d!	r| j&|jz nPt4|d"	r||jz n6| j5
rtjt| j&|jz n| j | |jz d}'to 
rP| j{
rD| j| j& | ji  n
| j&  nR| j{
r| j }(| j| j& | ji  | j })|(|)k}'n| j&  | j j }'|'
rt_| j#tTjj#j
s| j#  |Z  | j) jEd7  _E||"d |! |  | j)_=| jG|| j)| j\| _\| ||||| n| jG|| j)| j\| _\| j\jsJ| j\jr qTq|"dk rtd#| j)jE d$| d% d| j\_| jG|| j)| j\| _\| ||||| tj| jjkrto rt|t  n
td& | j\jr  qq |jcrt4| d'rt| d' t:d( |jrr| j)jd k	rrto rBt|d) n(|jtjkrZt  nt rjt  |   |  jX| 7  _X| jX| j)jE }*td*||| j)j|	d+}+|   | j)j|+d,< |*|+d-< d| _| j|+ | |+ | |},| jd|,d.}-| jjrZ| j)jd k	rZ| jjdkrZ|-D ]6}.t?j@|.| j)js"t:d/|. d0 t|. q"| jG|| j)| j\| _\|   t| j)jE|*|+S )1Nz)Currently training with a batch size of: r   r   zYargs.max_steps must be set to a positive value if dataloader does not have a length, was zjCurrently --debug underflow_overflow is not supported under DP. Please use DDP (torch.distributed.launch).Fr_  Tr  z***** Running training *****  Num examples = r  z  Num Epochs = z(  Instantaneous batch size per device = zA  Training with DataParallel so batch size has been adjusted to: zE  Total train batch size (w. parallel, distributed & accumulation) = z   Gradient Accumulation steps = z  Total optimization steps = z#  Number of trainable parameters = )Ztrainable_onlyzE  Continuing training from checkpoint, will skip to saved global_stepz!  Continuing training from epoch z'  Continuing training from global step z  Will skip the first z epochs then the first z batches in the first epoch.rp  r   rz  g      ?)scaleclip_grad_normclip_grad_norm_zXThere seems to be not a single sample in your epoch_iterator, stopping training at step zI! This is expected if you're using an IterableDataset and set num_steps (z.) higher than the number of available samples.zYou enabled PyTorch/XLA debug metrics but you don't have a TPU configured. Check your training configuration if this is unexpected._pastzU

Training completed. Do not forget to share your model on huggingface.co/models =)

load_best_model_at_endr  )num_samples	num_stepsr  
total_flos
train_loss	use_mtimer   Deleting older checkpoint [] due to args.save_total_limit)rO  Zfree_memoryr  r   r{  rR  rD  rX  r\   r   maxr  r   r  Zinclude_tokens_per_secondr  mathceilnum_train_epochssysmaxsizer   r   ZUNDERFLOW_OVERFLOWr   r  r   r   r   rT   r   rx   r   r   r  r   r   r#   r   re  r7   r	  Zis_hyper_param_searchZlogging_stepsZ
eval_stepsZ
save_stepsgradient_checkpointingZgradient_checkpointing_enabler  r   rP  r  r   r  r   r$   r  _load_optimizer_and_schedulerr   r  rA   epochr  r   r  isfiler:  r  Zload_from_jsonr  Zignore_data_skipr   train_dataloaderr   r  Z
trial_namer  rO   r  r  r   Ztrial_paramsr   r   r   Ztensorr%  r   _total_loss_scalar_globalstep_last_loggedZ	zero_gradZon_train_beginr  ranger@   r   r   r/   r+  
past_indexr'  Zon_epoch_begin_load_rng_stater   r  r  closeZon_step_begin
accumulatetraining_stepZlogging_nan_inf_filterr{   isnanisinfr  floatfloating_point_opsr   r   accelerate_versionZgradient_stateZ_set_sync_gradientsr  r  rV  Z_fetch_gradientsZ
all_reducerW  r  Zunscale_r   Zclip_master_gradsr%  r&  r   r   r   Zmaster_paramsr   r  Z	get_scaleZoptimizer_step_was_skippedrq  ReduceLROnPlateauZon_step_end_maybe_log_save_evaluateZon_substep_endZshould_epoch_stopZshould_training_stopr  Zon_epoch_endTPU_METRICS_DEBUGmaster_printmetmetrics_reportdelattrr(  best_model_checkpoint
rendezvousr   rb   r   distr  r  _load_best_modelitemr`   
store_flosr+  r   r   r  log_get_output_dir_sorted_checkpointsr   save_total_limitsamefileshutilrmtreer  _finish_current_pushrV   )/r  rI  r   r  r  r  r;  Ztotal_train_batch_sizeZlen_dataloaderZnum_train_tokensZnum_update_steps_per_epochr  r   r4  Znum_train_samplesZdebug_overflowZdelay_optimizer_creationr   Zuse_accelerator_preparer  Zepochs_trainedZsteps_trained_in_current_epochZsteps_trained_progress_barr  tr_lossr9  rJ  Zis_random_sampler_Ztotal_batched_samplesZepoch_iteratorZsteps_in_epochZrng_to_syncZsteps_skippedr  inputsZtr_loss_stepZ)is_last_step_and_steps_less_than_grad_accZ	gradientsZoptimizer_was_runZscale_beforeZscale_afterr,  r  run_dircheckpoints_sorted
checkpointr   r   r   r   >  s   












	




  





































&zTrainer._inner_training_loopc                 C   s   | j d k	r|d k	r| j tjkr&|j}nR| j tjkrHddlm} | }n0| j tjkr\|j	}n| j tj
krxdd l}|jj	}| jd k	r| |nd| }tj| jj|}n| jj}|S )Nr   r  zrun-)r  rO   r  numberr  r  r  Zget_trial_idr  idr  r  runr   r   r  r:  r   r   )r  r  Zrun_idr  r  Zrun_namera  r   r   r   rW    s    
zTrainer._get_output_dirc                    s  |d kr| j }tj t}tj t}tj t}tj t}tj t}tj t	}tj t
}	tj ot fddt D }
|
r| jstd  dtdd ||||	||fD s|
std  td  d tj|r>t|}|j}|d k	r>|tkr>td	| d
t d tj|s`tj|s`|
rTt rtjtj drtj tddd nLt| jdr| jjdkrtd tj|dd}d|d< |j |dd}~np| jrt!| j"j#j$| j"|  nP| jj%r,tj|r,t&jj'|dd}ntj|dd}| |d}~| (| nt) rt*|t+rt|drt|drtj, r|j- |j.dd ntdt d n
td n(t/| t | jj%d}t s| (| d S )Nc                 3   s6   | ].}t jt j |rtd d |kV  qdS r  r   N)r   r  isdirr:  rk   r  r   Zfolder_namer  r   r   	<genexpr>2  s   z0Trainer._load_from_checkpoint.<locals>.<genexpr>zCheckpoint found at z* is only supported when using PyTorch FSDPc                 s   s   | ]}t j|V  qd S r  )r   r  r:  )r   fr   r   r   rk  <  s   z!Can't find a valid checkpoint at zLoading model from r  z9You are resuming training from a checkpoint trained with z- of Transformers but your current version is zJ. This is not recommended and could yield to errors or unwanted behaviors.user_content.ptFr  tagr  Zload_optimizerr   TzPEnabling FP16 and loading from smp < 1.10 checkpoint together is not suppported.r   map_location_smp_is_partialr  r   active_adapterload_adapter)Zis_trainablejThe intermediate checkpoints of PEFT may not be saved correctly, consider using a custom callback to save i in corresponding saving folders. Check some examples here: https://github.com/huggingface/peft/issues/96GCould not load adapter model, make sure to have `peft>=0.3.0` installed)r  Zprefer_safe)0r   r   r  r:  rg   rf   re   rk   rj   ri   rh   rh  anylistdirr   r   r   r   r:  r   Zfrom_json_fileZtransformers_versionr   r  rx   r  r  r   r   r   r   loadload_state_dictr   rO  r	  fsdp_pluginsave_safetensorssafetensors	load_file_issue_warnings_after_loadru   r   r   existsru  rt  r(   )r  r  r   config_fileZadapter_weights_fileZadapter_safe_weights_fileZweights_fileZweights_index_fileZsafe_weights_fileZsafe_weights_index_fileZis_fsdp_ckptconfigZcheckpoint_versionr  load_resultr   rj  r   r  '  s    

"   
   zTrainer._load_from_checkpointc           
      C   s  t d| jj d| jj d tj| jjt}tj| jjt	}tj| jjt
}tj| jjt}t rt| jn| j}| jrt| j| jj n4| jrt| jjj| j|| jj}ntj|stj|stj|stj|rrd}t rztjtj| jjdr*tj| jjtddd nN| jjrTtj|rTtjj|dd	}ntj|dd
}d|d< |j|dd}nt  rt!|t"rt#|drt#|drtj|stj|r|$| jj|j% ddl&m'}	 |	g g }nt (dt
 d d}nt (d d}nD| jjr>tj|r>tjj|dd	}ntj|dd
}||d}t s|r| )| nVtjtj| jjt*rt+|| jjt d}t s| )| nt (d| d d S )NzLoading best model from z	 (score: z).Trm  Frn  r   rs  rp  rr  r  rt  ru  r   )_IncompatibleKeysrv  rw  rx  z#Could not locate the best model at zi, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.),r   r   r	  rP  best_metricr   r  r:  rk   ri   rf   re   rx   r   r   r   r$   r   r   rO  r}  r  r:  r  r  r   r~  r  r   r  r{  r|  ru   r   r   r   ru  rt  Ztorch.nn.modules.moduler  r  r  rj   r(   )
r  Zbest_model_pathZbest_safe_model_pathZbest_adapter_model_pathZbest_safe_adapter_model_pathr   r  Zhas_been_loadedr  r  r   r   r   rS    s        



	

  
zTrainer._load_best_modelc                 C   sv   t |jdkrP| jjd k	r<t|jt| jjkr<| j  ntd|j d t |jdkrrtd|j d d S )Nr   z8There were missing keys in the checkpoint model loaded: r  z;There were unexpected keys in the checkpoint model loaded: )	r   Zmissing_keysr   Z_keys_to_ignore_on_saver   r$  r   r  Zunexpected_keys)r  r  r   r   r   r    s    z"Trainer._issue_warnings_after_loadc                 C   sp  | j jrt rt  i }| |  }||8 }t|| j	j
| j  d|d< |  |d< |  j|7  _| j	j
| _|   | | d }| j jr:t| jtri }| j D ](\}	}
| j|
|d|	 d}|| qn| j|d}| || j	j
| t| jtjjjr:| jj}|ds*d| }| j||  | j j rl| j!|||d | j"#| j| j	| j | _ d S )Nr  r  r  eval_)r   ignore_keysmetric_key_prefixr  )r  )$r  Z
should_logr{   rV  r
  _nested_gathermeanrT  r  r	  r  r=  r   r<  rU  rV  Zshould_evaluater   r   r  r  evaluater  r  r   r   rq  rI  r   metric_for_best_model
startswithr  r   _save_checkpointr   Zon_save)r  r^  r   r  r9  r  logsZtr_loss_scalarr  Zeval_dataset_namer   Zdataset_metricsmetric_to_checkr   r   r   rJ    sB    




z Trainer._maybe_log_save_evaluatec              
   C   sP  |d krd S | j jdkrZ| j j}tj|d| d}tj|std| d d S n(tj|d}tj|std d S t	
|}t|d  tj|d	  t	j|d
  t	j r6| j jtjkrt	jj|d  nNzt	jj|d  W n6 tk
r4 } ztd| d W 5 d }~X Y nX t rLt|d  d S )Nr   
rng_state_.pthz$Didn't find an RNG file for process zr, if you are resuming a training that wasn't launched in a distributed fashion, reproducibility is not guaranteed.rng_state.pthzDidn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.pythonnumpyr   cudazUDidn't manage to set back the RNG states of the GPU because of the following error:
 zO
This won't yield the same results as if the training had not been interrupted.r   )r   rX  process_indexr   r  r:  r:  r   r   r   r{  randomsetstatenpZ	set_stateZset_rng_stater  is_availabler   rb   r   Zset_rng_state_allr  r{   rV  )r  rc  r  Zrng_fileZcheckpoint_rng_stater  r   r   r   r@  &	  s>    


zTrainer._load_rng_statec              	   C   s  t  d| jj }| jd kr,|d kr,|   | j|d}tj||}| j	|dd | j
rf| j| | jtjkr|| j  | js| jr| jrt| jjj| j| j| j| n*| jj| j| j}t|tj|t t rBtd t| j  tj|t t!j"dd*}t| j#  tj|t$ t%| W 5 Q R X nt& r| jj'dd}	t()  t(* d	kszt(jj+j,rt(j|	tj|tdt(jj+j,d
 n>| j-j.r| j
s| js| jst| j  tj|t | j
ot/| j#t0 }
| j-j.rr| j
r
|
rrt srt!j"dd"}t| j#  tj|t$ W 5 Q R X t%| | j1rrt| j2  tj|t3 |d k	r| j-j4d k	r| j-j4}|5dsd| }|| }| j-j6rt7j8nt7j9}| jj:d ks| jj;d ks||| jj:r|| j_:|| j_;| j-j.r"| j<tj|t= t>? t7j>@ tj>A d}tjBC r|| j-jDtEjFkrltjBj>G |d< ntjBj>A |d< t rtA |d< tjH|dd | j-jIdkrt|tj|d n"t|tj|d| j-jJ d | j-jKr| L| | j-j.r| jMd|d d S )Nr  r  Tr  Zsaving_optimizer_statesrecordF)Zgather_if_shardr   )r  Zv3r  )r  r  r   r  r   r   r   r  r  r  r-  )NrJ   r	  r  r  rU  rW  r   r  r:  r  r   r   save_checkpointr   rT   r   r   Zconsolidate_state_dictr   r   r   rO  r}  r   r   Zfull_optim_state_dictr   r  r  r{   rV  rQ  r  r   catch_warningsr   r  rH   rx   Zlocal_state_dictr  r  Zrdp_rankr
  Zshard_optimizer_stater   r   r   r   r  r  SCALER_NAMEr  r  Zgreater_is_betterr  Zgreaterlessr  rP  r  r  r  getstateZ	get_stateZget_rng_stater  r  r   rb   r   Zget_rng_state_allr   rX  r  r   _push_from_checkpoint_rotate_checkpoints)r  r   r  r  checkpoint_folderra  r   full_osdcaught_warningsZopt_state_dictZis_deepspeed_custom_schedulerr  Zmetric_valueoperatorZ
rng_statesr   r   r   r  O	  s    
    
"
 &



"


zTrainer._save_checkpointc           
   	      s   dkrdS | j r^t| jtsZtjdd"}| jtt	j
 t W 5 Q R X t| dS t r|tt	j
 td nRt	j
t	j
 tpt	j
t	j
 tpt	j
 ot fddt	 D }|rt	j
t	j
 trt rtjt	j
 tdd}tjdd}tjt	j
 tdd}W 5 Q R X t| t|| jj t|| jj | j| | j| nvt rt	j
t	j
 d	r fd
d}n fdd}| j| n| jjdkr| jjnd}| js| j r`| j rt!| j"j#j$| j"| j| j%  nDd}| jj&dkr@tt	j
 t}| j%j'(|| j%}	| j|	 n | jtjt	j
 t|d tjdd"}| jtt	j
 t W 5 Q R X t| | j)rt	j
t	j
 t*r| j+tt	j
 t* dS )z3If optimizer and scheduler states exist, load them.NTr  _*c                 3   s6   | ].}t jt j |rtd d |kV  qdS rg  )r   r  rh  r:  OPTIMIZER_NAME_BINr  ri  rc  r   r   rk  	  s   z8Trainer._load_optimizer_and_scheduler.<locals>.<genexpr>r   rp  rm  c                    s"   | tjtj tdd d S )NTr  )r|  r  r{  r   r  r:  r  modoptr  r   r   opt_load_hook	  s    z<Trainer._load_optimizer_and_scheduler.<locals>.opt_load_hookc                    sH   t r&|tjtj tddd n|tjtj tdd d S )NT)r  Zback_compatr  )r  r|  r  r{  r   r  r:  r  r  r  r   r   r  	  s
    r   r   ),r   r   r   r   r   r  r|  r   r{  r   r  r:  r  rH   rx   globr  r:  r  rh  ry  rz  r{   rV  Zsend_cpu_data_to_devicer   r   r   r   Zregister_post_step_hookrX  r   r   r   rO  r	  r}  r   r  r   Zscatter_full_optim_state_dictr  r  r  )
r  rc  r  Zcheckpoint_file_existsZoptimizer_stateZlr_scheduler_stater  rq  r  Zsharded_osdr   r  r   r8  	  sp    &"& z%Trainer._load_optimizer_and_scheduler   minimizer  )r  r  n_trials	directionbackendr   r?  c           
      K   s   |dkrt  }t|}t|  }|  || _| jdkr@td|dkrN|jn|| _|| _	|dkrft
n|| _|j| ||f|}	d| _|	S )aD  
        Launch a hyperparameter search using `optuna` or `Ray Tune` or `SigOpt`. The optimized quantity is determined
        by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided,
        the sum of all metrics otherwise.

        <Tip warning={true}>

        To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to
        reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to
        subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom
        optimizer/scheduler.

        </Tip>

        Args:
            hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*):
                A function that defines the hyperparameter search space. Will default to
                [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`] or
                [`~trainer_utils.default_hp_space_sigopt`] depending on your backend.
            compute_objective (`Callable[[Dict[str, float]], float]`, *optional*):
                A function computing the objective to minimize or maximize from the metrics returned by the `evaluate`
                method. Will default to [`~trainer_utils.default_compute_objective`].
            n_trials (`int`, *optional*, defaults to 100):
                The number of trial runs to test.
            direction (`str` or `List[str]`, *optional*, defaults to `"minimize"`):
                For single-objective optimization, `direction` is a `str` and can be `"minimize"` or `"maximize"`;
                pick `"minimize"` when optimizing the validation loss and `"maximize"` when optimizing one or
                several metrics. For multi-objective optimization, `direction` is a `List[str]` whose entries are
                `"minimize"` or `"maximize"`, chosen with the same convention for each individual objective.
            backend (`str` or [`~training_utils.HPSearchBackend`], *optional*):
                The backend to use for hyperparameter search. Will default to optuna or Ray Tune or SigOpt, depending
                on which one is installed. If all are installed, will default to optuna.
            hp_name (`Callable[["optuna.Trial"], str]`, *optional*):
                A function that defines the trial/run name. Will default to None.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to `optuna.create_study` or `ray.tune.run`. For more
                information see:

                - the documentation of
                  [optuna.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
                - the documentation of [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run)
                - the documentation of [sigopt](https://app.sigopt.com/docs/endpoints/experiments/create)

        Returns:
            [`trainer_utils.BestRun` or `List[trainer_utils.BestRun]`]: All the information about the best run or best
            runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray
            backend.
        NzXTo use hyperparameter search, you need to pass your model through a model_init function.)r"   rO   r!   Zensure_availabler  r   r   Zdefault_hp_spacer  r   rW   r  rf  )
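# A minimal usage sketch (assumption, not part of this file): it assumes the optuna
# backend is installed and that `model_init`, `training_args`, `train_dataset`, and
# `eval_dataset` are defined by the caller; the hyperparameter names are illustrative.
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical(
            "per_device_train_batch_size", [16, 32, 64]
        ),
    }

trainer = Trainer(
    model=None,                # the model is rebuilt by model_init for every trial
    model_init=model_init,     # callable returning a fresh model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
best_run = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    backend="optuna",
    n_trials=20,
    direction="minimize",
)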
r  r  r  r  r  r  r   r  Zbackend_objZbest_runr   r   r   hyperparameter_search&
  s     ;

zTrainer.hyperparameter_search)r  r?  c                 C   sZ   | j jdk	rt| j jd|d< |d| j ji}| j j| | j| j| j | j	|| _	dS )z
        Log `logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (`Dict[str, float]`):
                The values to log.
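# A minimal subclassing sketch (assumption, not part of this file): it adds one derived
# value before delegating to the parent implementation of `log`.
class LoggingTrainer(Trainer):
    def log(self, logs):
        # Tag every log entry with the current global step for easier post-processing.
        logs["global_step"] = self.state.global_step
        super().log(logs)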
        N   r9  r  )
r	  r9  r  r  Zlog_historyr  r   Zon_logr   r  )r  r  outputr   r   r   rV  u
  s
    
zTrainer.log)r   r?  c                    s   t |tr(t| fdd| D S t |ttfrPt| fdd|D S t |tjrd jj	i} j
rt|st|r|d jjjj i |jf |S |S )z|
        Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
        c                    s   i | ]\}}|  |qS r   _prepare_inputr  r  r   r   rs  
  s      z*Trainer._prepare_input.<locals>.<dictcomp>c                 3   s   | ]}  |V  qd S r  r  )r   r  r  r   r   rk  
  s     z)Trainer._prepare_input.<locals>.<genexpr>r   r  )r   r   r6  r  r  r+  r   Tensorr   r   r   Zis_floating_pointZ
is_complexr  rO  r	  r  r  r  r%  )r  r   r  r   r  r   r  
  s    
zTrainer._prepare_input)r`  r?  c                 C   sR   |  |}t|dkr.tdd| j d| jjdkrN| jdk	rN| j|d< |S )z
        Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
        handling potential state.
        r   zThe batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: r  r  NZmems)r  r   r   r:  r  r   r?  r'  r  r`  r   r   r   r  
  s    

zTrainer._prepare_inputsc                 C   s   |   S )zF
        A helper wrapper to group together context managers.
        )autocast_smart_context_managerr  r   r   r   compute_loss_context_manager
  s    z$Trainer.compute_loss_context_managerr  c                 C   sH   | j s| jr<| jr&tjjj|| jdntjjj|| jd}nt	 }|S )z
        A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
        arguments, depending on the situation.
        )r  r  )
r  r  r   r   r   r  r  r  
contextlibnullcontext)r  r  Zctx_managerr   r   r   r  
  s    z&Trainer.autocast_smart_context_manager)r   r`  r?  c              	   C   s   |   | |}t r>t||| jj}|  | jj	S | 
  | ||}W 5 Q R X | jjdkrr| }| jr| j|  n6| jrt|| j}|  W 5 Q R X n| j| | | jj S )aq  
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.

        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
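# A minimal subclassing sketch (assumption, not part of this file): it reuses the parent
# step and only adds occasional console output of the detached per-step loss.
class VerboseTrainer(Trainer):
    def training_step(self, model, inputs):
        loss = super().training_step(model, inputs)
        if self.state.global_step % 100 == 0 and self.is_world_process_zero():
            print(f"step {self.state.global_step}: loss {loss.item():.4f}")
        return loss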
        r   )r  r  rx   r   r   rD  reduce_meandetachr%  r   r  compute_lossr  r  r  r  r$  Zbackwardr  r   Z
scale_lossr   rO  )r  r   r`  loss_mbr  Zscaled_lossr   r   r   rC  
  s     

zTrainer.training_stepc                 C   s  | j dk	rd|kr|d}nd}|f |}| jjdkrF|| jj | _|dk	rt rnt|trnt|j	
 }nt|
 }|t kr| j ||dd}q|  ||}nVt|trd|krtdd|  d	d|  d
t|tr|d n|d }|r
||fS |S )z
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
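# A minimal subclassing sketch (assumption, not part of this file): a class-weighted
# cross-entropy for sequence classification. `class_weights` is a hypothetical 1-D tensor
# of per-class weights supplied by the caller, not something this file defines.
from torch import nn

class WeightedLossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weight = self.class_weights.to(logits.device) if self.class_weights is not None else None
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss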
        Nlabelsr   T)Zshift_labelsr  zJThe model did not return a loss from the inputs, only the following keys: r  z,. For reference, the inputs it received are r  )r  r  r   r?  r'  ru   r   r   r)   Z
base_modelZ	_get_namer*   r   r  r   r:  r,  )r  r   r`  return_outputsr  outputsr@  r  r   r   r   r  
  s&    
$zTrainer.compute_lossc                 C   s   | j jdkS )z
        Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several
        machines) main process.
        r   )r   Zlocal_process_indexr  r   r   r   r     s    zTrainer.is_local_process_zeroc                 C   s"   t  rt dkS | jjdkS dS )z
        Whether or not this process is the global main process (when training in a distributed fashion on several
        machines, this is only going to be `True` for one process).
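# A minimal usage sketch (assumption, not part of this file): guard side effects such as
# writing files so that only the global main process performs them during distributed
# training; `trainer` and `metrics` are assumed to exist in the calling script.
import json

if trainer.is_world_process_zero():
    with open("final_metrics.json", "w") as f:
        json.dump(metrics, f)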
        r   N)rx   r  rT  r   r  r  r   r   r   r     s    zTrainer.is_world_process_zero)r   r  c                 C   s  |dkr| j j}t r$| | nt rvtj|dd | j }| j j	rX| j
||d trrttj|d  nNtj| j jkstj| j jks| jdk	s| jr| js| j ni }| j j	r| j
||d | jrt| j j	|ttg t| jjj| j| j| n| jrt t!t dkr*t"dz*| j#| j$}| j j	rR| j
||d W nX t"k
r   t%&d | j j	r| j
|i d t| j j	|ttg | j'| Y nX n| j j	r| 
| | j j(r|s| j(d	d
 dS )z
        Will save the model, so you can reload it using `from_pretrained()`.

        Will only save from the main process.
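# A minimal usage sketch (assumption, not part of this file): save the fine-tuned model
# to a directory, then reload it with the standard `from_pretrained` API (the auto class
# below assumes a sequence-classification checkpoint; swap it for your task).
trainer.save_model("./my-finetuned-model")

from transformers import AutoModelForSequenceClassification
reloaded_model = AutoModelForSequenceClassification.from_pretrained("./my-finetuned-model")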
        NTr   )r  rm  r   z#Install Accelerate from main branchz| stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use zero_to_fp32.py to recover weightsz
Model save)commit_message))r   r   r{   	_save_tpurx   r   r   r   r  r   _saver  r   r  r:  touchrT   r   r   r   r   r   r   rI   rk   ri   r   rO  r	  r}  r   r   r   rH  r   Zget_state_dictr   r   r  r  r   )r  r   r  r  r   r   r   r    sX    




zTrainer.save_modelr   c                 C   s  |d k	r|n| j j}td|  t rRtj|dd t	| j tj
|t td t| jtstt| jtrt| jj|| j j| j tj	d qtd | j }t	|tj
|t n| jj|| j jtj	d | jd k	r| j jr| j| d S )NSaving model checkpoint to Tr   Zsaving_checkpoint)is_main_processr  save_functionETrainer.model is not a `PreTrainedModel`, only saving its state dict.)r  r  )r   r   r   r   rV  Zis_master_ordinalr   r   r   r  r  r:  TRAINING_ARGS_NAMErQ  r   r   r'   r)   save_pretrainedr   r  rk   r   )r  r   r  r   r   r   r  [  s(    



zTrainer._save_tpuc                 C   s  |d k	r|n| j j}tj|dd td|  t s>tfnttf}t	| j
|s|d krd| j
 }t	t| j
|rt| j
j||| j jd qtd | j jrtj|tj|t qt|tj|t n| j
j||| j jd | jd k	r| j| t| j tj|t d S )NTr   r  )r  Zsafe_serializationr  )r   r   r   r   r   r   ru   r'   r   r   r   r  r)   r  r~  r  r   Z	save_filer  r:  ri   r  rk   r   r  )r  r   r  Zsupported_classesr   r   r   r  w  s2    

  
  zTrainer._savec                 C   sZ   | j jtjkr>| j jt| jg| j jd	 
 7  _d| _n| j j| j7  _d| _d S )Nrs  r   )r   r   rb   r   r	  r+  r=   r  r   rz  rT  r  r   r   r   rU    s    zTrainer.store_flosc                 C   s   g }dd t || dD }|D ]b}|rF|tj||f q&td| d|}|d k	r&| d k	r&|t	| d |f q&t
|}dd |D }| jjd k	r|tt | jj}	t|	t|d D ]&}
||
d	  ||
  ||
< ||
d	 < q|S )
Nc                 S   s    g | ]}t j|rt|qS r   )r   r  rh  r  )r   xr   r   r   r     s      z/Trainer._sorted_checkpoints.<locals>.<listcomp>z-*z.*z	-([0-9]+)r   c                 S   s   g | ]}|d  qS )r   r   )r   rc  r   r   r   r     s     r  r   )r   r  r  r   r  getmtimerematchgroupsr  sortedr	  rP  indexr  r>  r   )r  r   Zcheckpoint_prefixr.  Zordering_and_checkpoint_pathZglob_checkpointsr  Zregex_matchrb  Zbest_model_indexir   r   r   rX    s    $zTrainer._sorted_checkpointsc                 C   s   | j jd ks| j jdkrd S | j||d}t|| j jkr>d S | j j}| jjd k	rr| j jdkrr|d | jjkrrd}tdt|| }|d | }|D ]$}td| d t	j
|dd	 qd S )
Nr   r-  r   r#  r  r/  r0  T)ignore_errors)r   rY  rX  r   r	  rP  r1  r   r   r[  r\  )r  r.  r   rb  rY  Znumber_of_checkpoints_to_deleteZcheckpoints_to_be_deletedrc  r   r   r   r    s$    

zTrainer._rotate_checkpointsr  )r   r  r  r?  c           	   
   C   s  | j   | |}t }| jjr*| jn| j}||d| jdkrDdnd||d}| jj	| jj
 }| d|jkr||j| d 7 }|jt|||jt|j| d | |j tj| jjkrtt  | j| j| j| j|j| _| j |j |jS )a'  
        Run evaluation and return metrics.

        The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
        (pass it to the init `compute_metrics` argument).

        You can also subclass and override this method to inject custom behavior.

        Args:
            eval_dataset (`Dataset`, *optional*):
                Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
                not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
                method.
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                "eval_bleu" if the prefix is "eval" (default)

        Returns:
            A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
            dictionary also contains the epoch number which comes from the training state.
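# A minimal usage sketch (assumption, not part of this file): run evaluation on a
# held-out split with a custom metric prefix; `validation_dataset` is caller-provided,
# and "val_accuracy" only exists if a `compute_metrics` function returns it.
metrics = trainer.evaluate(eval_dataset=validation_dataset, metric_key_prefix="val")
print(metrics["val_loss"], metrics.get("val_accuracy"))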
        Z
EvaluationNT)r/  prediction_loss_onlyr  r  _jit_compilation_timer)  r*  )r   r   r[  r  r   rU  prediction_loopevaluation_loopr   rZ  rX  r  r  r`   r)  r2  r3  rV  r   rK  r{  rV  rL  rM  rN  r   Zon_evaluater	  r  r  )	r  r   r  r  eval_dataloaderr  	eval_loopr  total_batch_sizer   r   r   r    s8    


	zTrainer.evaluater]  )r\  r  r  r?  c           	   
   C   s   | j   | |}t }| jjr*| jn| j}||d||d}| jj| jj	 }| d|j
krt||j
| d 7 }|j
t|||jt|j| d | j| j| j| j|j
| _| j |j
 t|j|j|j
dS )a  
        Run prediction and return predictions and potential metrics.

        Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
        will also return metrics, like in `evaluate()`.

        Args:
            test_dataset (`Dataset`):
                Dataset to run the predictions on. If it is a `datasets.Dataset`, columns not accepted by the
                `model.forward()` method are automatically removed. It has to implement the method `__len__`.
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (`str`, *optional*, defaults to `"test"`):
                An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                "test_bleu" if the prefix is "test" (default)

        <Tip>

        If your predictions or labels have different sequence lengths (for instance because you're doing dynamic padding
        in a token classification task), the predictions will be padded (on the right) to allow for concatenation into
        one array. The padding index is -100.

        </Tip>

        Returns: *NamedTuple* A namedtuple with the following keys:

            - predictions (`np.ndarray`): The predictions on `test_dataset`.
            - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
            - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
              labels).
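# A minimal usage sketch (assumption, not part of this file): run inference on a test
# split and turn the raw logits into class ids; `test_dataset` is caller-provided and
# the model is assumed to return a single logits array.
import numpy as np

output = trainer.predict(test_dataset)
pred_ids = np.argmax(output.predictions, axis=-1)  # logits -> predicted class ids
print(output.metrics)  # task metrics such as "test_loss" appear only when labels were present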
        Z
Prediction)r/  r  r  r  r  )predictionsr'  r  )r   r   r^  r  r   rU  r  r  rZ  rX  r  r  r`   r)  r2  r3  r   Z
on_predictr	  r  r  rR   r  r'  )	r  r\  r  r  Ztest_dataloaderr  r  r  r  r   r   r   predict  s0    $

   	zTrainer.predict)r  r/  r  r  r  r?  c                  C   s  | j }|dk	r|n|j}| jr:| jdkr:t| ddd\}}| j| jd|d}t| jj	dkr|| jkr| jrx| j
|n| jj|dd}| jr|| _|| jk	r|| _| jr| j| _| js|jr|jtj|jd}n|jr|jtj|jd}| j j}	td	| d
 t|r&td| |  n
td td|	  |  || j_t|dd}
|jdkrnd| _ d}d}d}d}d}d}d}d}d}t!|D ]\}}t"|}|dk	r||7 }|	dkr|}	| j#||||d\}}}t| jdd}|j$r| %|| nd}t& rt'(  |dk	rT| j)|*|	}|dkrF|nt+||dd}|dk	rp| jj,|ddd}|dk	r| jj,|ddd}| j)|}|dkr|nt+||dd}|dk	r| jj,|ddd}| j-dk	r| -||}| j)|}|dkr|nt+||dd}|dk	rB| j)|}|dkr4|nt+||dd}| j.|| j/| j0| _0|j1dk	r|d |j1 dkr| jj2st34t5t34dkr|dk	rt6|}|dkr|nt7j8||fdd}|dk	rt6|}|dkr|nt+||dd}|dk	r&t6|}|dkr|nt+||dd}|dk	rTt6|}|dkrF|nt+||dd}d\}}}}q|jrt9| drt:| d |dk	rt6|}|dkr|nt7j8||fdd}|dk	rt6|}|dkr|nt+||dd}|dk	rt6|}|dkr|nt+||dd}|dk	r>t6|}|dkr0|nt+||dd}t|
rRt|
}n@t;|
t<rxt|
dddkrx|
j}nt|r| |}n|}|dkr|dkr|}| j=dk	r|dk	r|dk	r|j$r| =t>|||d}n| =t>||d}ni }t?|}|dk	r(|@ A || d< t9| drD| jB|| d< tC|D D ]0}|E| d sP|F||| d | < qPtG||||d!S )"
        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.

        Works both with or without labels.
        Nr   Tr`  Z	inferenceFrF  r  Zevaluation_moder  r   ***** Running  *****r"  z  Num examples: Unknown  Batch size = r.  r  main_input_namer  Zpadding_indexr   )dimZ	pad_indexr   )ZaxisNNNNr'  r  r  r'  r`  r  r'  _lossr  r  r_  r  r'  r  r)  )Hr   r  r   r   r#   r  r   r   rO  _modelsrP  prepare_modelr   r   r   r   r%  r   r  r   r   r  rZ  r   r   r\   r  r  r   r  r   r?  r'  r  r?   prediction_stepinclude_inputs_for_metricsr  r{   rV  r
  Zgather_for_metricsrepeatrD   Zpad_across_processesr   on_prediction_stepr	  r  eval_accumulation_stepsZsync_gradientsr   r   rH  rF   r  Zconcatenater   rO  r   r9   r   rM   rX   r  rT  r  r+  r,  r  r  rL   ) r  r  r/  r  r  r  r   r_  r   rI  r   losses_host
preds_hostlabels_hostinputs_hostZ
all_lossesZ	all_predsZ
all_labelsZ
all_inputsZobserved_num_examplesr  r`  Zobserved_batch_sizer  logitsr  r  inputs_decodelossesr)  r  r  r   r   r   r  Z  s   










 




 





 
zTrainer.evaluation_loopc                 C   sx   |dkrdS t  r*|dkrd}t||}nJt r:t|}n:| jjdk	rT| jjjdksl| jjdkrt| jjdkrtt|}|S )
        Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before
        concatenating them to `gathered`
        NZnested_gatherNOr#  )	r{   rG   rx   r   r   Zdistributed_stateZdistributed_typeZ
local_rankr>   r  Ztensorsrg  r   r   r   r  2  s    


zTrainer._nested_gather)r   r`  r  r  r?  c              
      s  t | jdkrdntfdd| jD }dd}|dkrD| j}t | jdkrZ|rZdnd}|  dkrt| jdrt| jj	d	g  ng  |s|rt
tfd
d| jD }t |dkr|d }nd}t  t rt|}	|s|rVt|	tr(|	d }
t fdd|	 D }n|	d }
|	dd }|
   }t|}n8d}t|	trt fdd|	 D }n|	}t|}n|s|r|   | j|dd\}}W 5 Q R X |  }t|trt fdd| D }n|dd }nnd}|   |f }W 5 Q R X t|trPt fdd| D }n|}| jjdkrt|| jjd  | _W 5 Q R X |r|ddfS t
|}t |dkr|d }|||fS )a  
        Perform an evaluation step on `model` using `inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.

        Return:
            Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
            logits and labels (each being optional).
        r   Fc                 3   s   | ]}  |d k	V  qd S r  r   r3  r`  r   r   rk  c  s     z*Trainer.prediction_step.<locals>.<genexpr>return_lossNTr  Zkeys_to_ignore_at_inferencec                 3   s   | ]}  |V  qd S r  r  rf  r  r   r   rk  u  s     r   r  c                 3   s$   | ]\}}| d g kr|V  qdS r  Nr   r  r  r   r   rk    s      c                 3   s   | ]\}}| kr|V  qd S r  r   r  r  r   r   rk    s      )r  c                 3   s$   | ]\}}| d g kr|V  qdS r  r   r  r  r   r   rk    s      c                 3   s   | ]\}}| kr|V  qd S r  r   r  r  r   r   rk    s      )r   r  allr   rm   r  r   r   r   r  rE   r  r   r  rx   r   r   r  r  r  r  r   r   r  r  r  r   r?  r'  )r  r   r`  r  r  Z
has_labelsr  Zloss_without_labelsr  Zraw_outputsr  Z	logits_mbr  r  r  r   )r  r`  r   r  E  sh    *








zTrainer.prediction_stepr  c                 C   s    t | jdr| j|S dS dS )a  
        For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point
        operations for every backward + forward pass. If using another model, either implement such a method in the
        model or subclass and override this method.

        Args:
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

        Returns:
            `int`: The number of floating-point operations.
        rG  r   N)r   r   rG  r  r   r   r   rG    s    zTrainer.floating_point_opsc                 C   s^   |   sdS | jjdkr,t| jj j}n| jj}t|| jj| jj	dd}|j
| _d| _dS )zE
        Initializes a git repo in `self.args.hub_model_id`.
        NT)tokenprivater   )r   r   r   r   r   absoluterg  r   	hub_tokenhub_private_reporepo_idpush_in_progress)r  	repo_namerepo_urlr   r   r   r     s    zTrainer.init_hf_repo)at_initc              	   C   sD  t d |  sdS | jj}|dkr8t| jj j}t	|| jj
| jjddj}zt| jj|| jj
d| _W nJ tk
r   | jjr|rt| jj t| jj|| jj
d| _n Y nX | j  tjtj| jjds$| jjtjkr$ttj| jjdddd	}|d
g W 5 Q R X tjdr:|   d| _ dS )a  
        Initializes a git repo in `self.args.hub_model_id`.

        <Tip warning={true}>

        This function is deprecated and will be removed in v4.34.0 of Transformers.

        </Tip>

        Args:
            at_init (`bool`, *optional*, defaults to `False`):
                Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is
                `True` and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped
                out.
        zy`Trainer.init_git_repo` is deprecated and will be removed in v4.34.0 of Transformers. Use `Trainer.init_hf_repo` instead.NT)r  r  r  r   )Z
clone_fromr  
.gitignorewzutf-8)encodingzcheckpoint-*/ZSM_TRAINING_ENV)!r   r   r   r   r   r   r   r  rg  r   r  r  r  r   repoEnvironmentErrorZoverwrite_output_dirr[  r\  Zgit_pullr   r  r  r:  hub_strategyrP   ALL_CHECKPOINTSopen
writelinesenvironr   _add_sm_patterns_to_gitignorer  )r  r  r  r  writerr   r   r   init_git_repo  s>       
zTrainer.init_git_repo	languagelicensetagsr@  finetuned_fromtasksdataset_tagsr.  dataset_argsc
                 C   sd   |   sdS tj| |||||||||	d
}
|
 }ttj| jj	dd}|
| W 5 Q R X dS )a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            language (`str`, *optional*):
                The language of the model (if applicable)
            license (`str`, *optional*):
                The license of the model. Will default to the license of the pretrained model used, if the original
                model given to the `Trainer` comes from a repo on the Hub.
            tags (`str` or `List[str]`, *optional*):
                Some tags to be included in the metadata of the model card.
            model_name (`str`, *optional*):
                The name of the model.
            finetuned_from (`str`, *optional*):
                The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
                of the original model given to the `Trainer` (if it comes from the Hub).
            tasks (`str` or `List[str]`, *optional*):
                One or several task identifiers, to be included in the metadata of the model card.
            dataset_tags (`str` or `List[str]`, *optional*):
                One or several dataset tags, to be included in the metadata of the model card.
            dataset (`str` or `List[str]`, *optional*):
                One or several dataset identifiers, to be included in the metadata of the model card.
            dataset_args (`str` or `List[str]`, *optional*):
               One or several dataset arguments, to be included in the metadata of the model card.
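# A minimal usage sketch (assumption, not part of this file): the metadata values below
# are illustrative; the resulting card is written to README.md inside `args.output_dir`.
trainer.create_model_card(
    language="en",
    license="apache-2.0",
    tags=["text-classification"],
    finetuned_from="bert-base-uncased",
    tasks="text-classification",
    dataset="imdb",
)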
        Nr(  z	README.mdr  )r   r&   Zfrom_trainerZto_model_cardr"  r   r  r:  r   r   write)r  r)  r*  r+  r@  r,  r-  r.  r.  r/  Ztraining_summaryZ
model_cardrl  r   r   r   create_model_card
  s"    %zTrainer.create_model_cardc           
      C   s  |   r| jjtjkrd S | jjs:| jd k	r:| j s:d S | jj}t	t
tg}t rb|tttg |D ]:}tjtj||rfttj||tj|| qf| jd k	r| j| t| jtj|t | jjtjkrd| jj  }ndt!| jj" }t#| j$||| jj%dddgd}|g}| jjtj&tj'fkr| jjtj&krNdnt(|j)}t#| j$|||d | jj%dd	}	|*|	 | jd ks| j rt+|| _n| jj,| d S )
NzTraining in progress, step zTraining in progress, epoch Tr  **/*r  folder_pathr  r  run_as_futureignore_patternszlast-checkpointz, checkpoint)r  r4  path_in_repor  r  r5  )-r   r   r   rP   ZENDZhub_always_pushr  is_doner   rg   rk   ri   ru   extendrd   rf   re   r   r  r:  r:  r[  r  r   r  r   r  r  Zsave_strategyrQ   ZSTEPSr	  r  r  r9  r   r   r  
CHECKPOINTr!  r   rg  r  rl   jobs)
r  r  r   Zmodeling_filesZmodeling_filer  Zmodel_push_jobZ	push_jobsr7  Zcheckpoint_pushr   r   r   r  B  sR    
"
	
zTrainer._push_from_checkpointc                 C   s:   t | dsd S | jd k	r6| j s6td | j  d S )Nr  z\Waiting for the current checkpoint push to be finished, this might take a couple of minutes.)r   r  r8  r   r   Zwait_until_doner  r   r   r   r]  {  s
    

zTrainer._finish_current_pushEnd of training)r  blockingr?  c                 K   s   | dd}|dkrJ| jjrJ| jjdkr8t| jjj}n| jjdd }| jdkr\|   | j	dd | 
 stdS | jf d|i| |   t| j| jj|| jj| ddgd	S )
u  
        Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`.

        Parameters:
            commit_message (`str`, *optional*, defaults to `"End of training"`):
                Message to commit while pushing.
            blocking (`bool`, *optional*, defaults to `True`):
                Whether the function should return only when the `git push` has finished.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to [`~Trainer.create_model_card`].

        Returns:
            The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the
            progress of the commit if `blocking=True`.
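# A minimal usage sketch (assumption, not part of this file): `hub_model_id` and
# `hub_token` are normally configured through `TrainingArguments`; extra keyword
# arguments are forwarded to `create_model_card`, and the return value is either the
# repository URL or a Future depending on `blocking`.
result = trainer.push_to_hub(
    commit_message="Training complete",
    blocking=False,
    finetuned_from="bert-base-uncased",  # forwarded to create_model_card
)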
        r@  N/r#  Tr  r  r2  r3  )r  r   r   r   r   r   rg  r  r   r  r   r1  r]  r   r  )r  r  r=  r  r@  r   r   r   r     s(    
zTrainer.push_to_hubc           #      C   s  | j }t|std|dk	r"|n|j}| jrJ| jdkrJt| ddd\}}| j| jd|d}t	| j
jdkr|| jkr| jr| j
|n| j
j|dd}| jr|| _|| jk	r|| _| jr| j| _| js|jr|jtj|jd	}n|jr|jtj|jd	}|j}	| |}
td
| d td|
  td|	  d}d}d}d}td|j}t||
|	d}|sd}t|drt |j!t"r|j!j}t||
|d}t||
|d}t||
|d}|#  |j$dkrd| _%|| j&_'t(|D ]\}}| j)||||d\}}}t*| jdd}|j+r,| ,|| nd}|dk	rd|-|	}|dkrR|ntj.||fdd}|dk	r|dkr||nt/||dd}|dk	r|dkr|nt/||dd}|dk	r|dkr|nt/||dd}| j&0|| j1| j2| _2|j3dk	r|d |j3 dkr|4| 5|d |sZ|4| 5|d |4| 5|d |4| 5|d d\}}}}q|j$rt| drt6| d |4| 5|d |s|4| 5|d |4| 5|d |4| 5|d |7 }|s|7 nd}|s|7 nd}|s|7 nd} | j8dk	rf|dk	rf|dk	rf|j+rR| 8t9||| d}!n| 8t9||d}!ni }!t:|!}!|dk	r|; < |!| d< t=|!> D ]0}"|"?| d s|!@|"|!| d |" < qtA|||!|
d!S )"r  z+dataloader must implement a working __len__Nr   Tr  Fr  r  r  r  r  r"  r  r   )make_multiple_ofrJ  r  r  r  )r  r  r  Zeval_lossesZ
eval_predsZeval_label_idsZeval_inputs_idsr  r'  r  r  r  r_  r  )Br   r\   r   r  r   r   r#   r  r   r   rO  r  rP  r  r   r   r   r   r%  r   r  r   r   r  rI  r  r   r   r1  rX  r8   r   r   rJ  r<   r  r?  r'  r   r  r  r  r   r  r  r  catrD   r   r	  r  r  Z
add_arrays_gather_and_numpifyrO  finalizer   rM   rX   r  rT  r+  r,  r  r  rL   )#r  r  r/  r  r  r  r   r_  r   rI  r  r  r  r  r  rX  Zeval_losses_gathererr?  Zpreds_gathererZlabels_gathererZinputs_gathererr  r`  r  r  r  r  r  r  Z	eval_losspredsr'  Z
inputs_idsr  r  r   r   r   r    s    



 


 
 
zTrainer.prediction_loopc                 C   sL   |dkrdS t  rt||}n&t r.t|}n| jjtjkrDt|}t	|S )r	  N)
r{   rG   rx   r   r   r   rb   r   r>   rF   r  r   r   r   rA  O  s    
zTrainer._gather_and_numpifyc              	   C   s  |   sdS ddg}tjtj| jjdr\ttj| jjdd}| }W 5 Q R X nd}|}|D ].}||krh|	dr||7 }qh|d| 7 }qh||krttj| jjdd }t
d	|  || W 5 Q R X | jd td
 | j s| jd | j  dS )z8Add SageMaker Checkpointing patterns to .gitignore file.Nz*.sagemaker-uploadingz*.sagemaker-uploadedr  rr0  
r  z"Writing .gitignore file. Content: g      ?z'Add *.sagemaker patterns to .gitignore.)r   r   r  r  r:  r  Z	local_dirr"  readendswithr   r{  r0  Zgit_addr  sleepZis_repo_cleanZ
git_commitZgit_push)r  patternsrl  Zcurrent_contentcontentpatternr   r   r   r%  _  s,    


z%Trainer._add_sm_patterns_to_gitignorec                 C   s(  d| j ji}tttdkr(d|d< tf |}t| j j| j j|d| _	t
| j	jdd d k	| _t
| j	jdd d k	| _| jr| j	jj}| j jd|j|_td	r| j jd
|j|_|jr| j jrtd| jr$t
| j dd d kr$ddlm} | j	jj}||jj|_|jj|_|j| j  d S )Nr*  r   FZsync_with_dataloader)dispatch_batchesr  gradient_accumulation_pluginr  r}  r   z0.23.0activation_checkpointingzThe activation_checkpointing in FSDP config and the gradient_checkpointing in training arg can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic when using FSDP.r  r   r  )r   rD  r   r   rH  r   r   rL  r  rO  r   r	  r   r   r}  r   r   r   ro   rN  r7  r   r  r  r  r  Zdeepspeed_configr  )r  Zgrad_acc_kwargsrM  r}  r  Z	ds_pluginr   r   r   r     sB    

  

z*Trainer.create_accelerator_and_postprocess)NNNNNNNNNr   N)N)N)N)N)N)N)F)TN)NNN)NNNNN)N)N)NNr  r  NN)T)F)NF)N)NN)FN)NNr  )Nr]  )NNr  )N)N)F)	NNNNNNNNN)r<  T)NNr  )tr   
__module____qualname____doc__trainer_pt_utilsr   r   r   r   r   r   r'   r   Modulerc   r	   r   r   r   r  r0   r   rM   r   r5   r
   r   rq  Z	Optimizerr   ZLambdaLRr  r  r   r"  r#  r   r-  r>  rA  r   r   ZSamplerrE  r   rR  rY  r[  r^  r  re  ri  rb  staticmethodr   rw  rc  r  r  r  rF  r  r  r   r  r  r  r  r  r  r   rW  r  rS  r  rJ  r@  r  r8  rO   rK   r  rV  r  r  r  r  rC  r  r   r   r  r  r  rU  rJ   rX  r  r  rR   r  rL   r  r  r  rG  r   r'  r1  r  r]  r   r  rA  r%  r   r   r   r   r   r      s  S           
  |  !" 7{1*

+
    
i         
   X
lV0)
z_      O$4()
$="     

H    
 B   
 Y
 
i ;         899   
 'r   )rQ  r  r  r  r  importlib.metadatar  r(  r2  r   r  r  r[  r5  r  r   collections.abcr   pathlibr   typingr   r   r   r   r   r	   r
   r   Zintegrationsr   r   r   Zhuggingface_hub.utilsr   r!  r  r  r   Ztorch.distributeddistributedrR  Zhuggingface_hubr   r   r   	packagingr   r   Ztorch.utils.datar   r   r   r   r0  r   Zconfiguration_utilsr   Zdata.data_collatorr   r   r   Zdebug_utilsr   r   Zdependency_versions_checkr    r  r!   r"   Zintegrations.deepspeedr#   r$   r%   Z	modelcardr&   Zmodeling_utilsr'   r(   r)   Zmodels.auto.modeling_autor*   r+   r  r,   r-   Zpytorch_utilsr.   r/   Ztokenization_utils_baser0   Ztrainer_callbackr1   r2   r3   r4   r5   r6   r7   rR  r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   Ztrainer_utilsrJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   Ztraining_argsra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   Zutils.quantization_configr~   r   r   Zutils.notebookr   r   r   r;  Ztorch_xla.core.xla_modelcoreZ	xla_modelrV  Ztorch_xla.debug.metricsr{  r  rM  r   Zfairscale.nn.data_parallelr   r   r   r  Zfairscale.nn.wrapr   Zfairscale.optimr   Zfairscale.optim.grad_scalerr   Z!smdistributed.modelparallel.torchZmodelparallelr  Zsmdistributed.modelparallelZSMP_VERSIONr   r  r   r   r   r   Zsafetensors.torchr  Zpeftr   Z
accelerater   r   rH  r  r   r   r   r   r   r   r   r  Z
get_loggerr   r   r  r  r  r  r  r  r   r   r   r   r   <module>   s   ($	Pdp
