U
    ,-e                     @   s@   d Z ddlZddlZddlmZ ddlmZ G dd deZdS )z
Processor class for Donut.
    N)contextmanager   )ProcessorMixinc                       sx   e Zd ZdZddgZdZdZd fdd	Zd	d
 Zdd Z	dd Z
edd ZdddZedd Zedd Z  ZS )DonutProcessora  
    Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
    processor.

    [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and
    [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and
    [`~DonutProcessor.decode`] for more information.

    Args:
        image_processor ([`DonutImageProcessor`]):
            An instance of [`DonutImageProcessor`]. The image processor is a required input.
        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
    image_processor	tokenizerZAutoImageProcessorZAutoTokenizerNc                    sr   d }d|kr"t dt |d}|d k	r.|n|}|d krBtd|d krRtdt || | j| _d| _	d S )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.F)
warningswarnFutureWarningpop
ValueErrorsuper__init__r   current_processor_in_target_context_manager)selfr   r   kwargsr   	__class__ k/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/donut/processing_donut.pyr   ,   s    
zDonutProcessor.__init__c                 O   s   | j r| j||S |dd}|dd}t|dkrJ|d }|dd }|dkrb|dkrbtd|dk	r|| j|f||}|dk	r| j|f|}|dkr|S |dkr|S |d |d< |S dS )	a  
        When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
        [`~AutoImageProcessor.__call__`] and returns its output. If used in the context
        [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
        [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
        imagesNtextr      zBYou need to specify either an `images` or `text` input to process.Z	input_idslabels)r   r   r   lenr   r   r   )r   argsr   r   r   inputs	encodingsr   r   r   __call__@   s&    zDonutProcessor.__call__c                 O   s   | j j||S )z
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r   batch_decoder   r   r   r   r   r   r!   a   s    zDonutProcessor.batch_decodec                 O   s   | j j||S )z
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
        docstring of this method for more information.
        )r   decoder"   r   r   r   r#   h   s    zDonutProcessor.decodec                 c   s0   t d d| _| j| _dV  | j| _d| _dS )z
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
        z`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your images inputs, or in a separate call.TNF)r	   r
   r   r   r   r   r   r   r   r   as_target_processoro   s    z"DonutProcessor.as_target_processorFc                 C   s   |dkr| j  }i }|rtd|tj}|dkr8q|d}t|}td| d|tj}| }|dkr||d}q| }t|}	t|}
t|	 d|
 |tj}|dk	r|d }d|krd|kr| j	|d	|d
}|rt
|dkr|d }|||< ng ||< |dD ]R}| }||krn|d dkrn|dd dkrn|dd }|| | q,t
|| dkr|| d ||< |||t
| d  }|dd dkr|g| j	|dd d	|d
 S qt
|r
|r|gS |S |rg S d|iS dS )zS
        Convert a (generated) token sequence into an ordered JSON format.
        Nz	<s_(.*?)>r   z</s_> z(.*?)z<s_T)is_inner_valueadded_vocabr   z<sep/><z/>   Ztext_sequence)r   Zget_added_vocabresearch
IGNORECASEgroupescapereplacestrip
token2jsonr   splitappendfind)r   tokensr(   r)   outputZstart_tokenkeyZkey_escapedZ	end_tokenZstart_token_escapedZend_token_escapedcontentvalueleafr   r   r   r4      sN    






* 
zDonutProcessor.token2jsonc                 C   s   t dt | jS )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r	   r
   r   image_processor_classr$   r   r   r   feature_extractor_class   s
    z&DonutProcessor.feature_extractor_classc                 C   s   t dt | jS )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r	   r
   r   r   r$   r   r   r   r      s
    z DonutProcessor.feature_extractor)NN)FN)__name__
__module____qualname____doc__
attributesr>   Ztokenizer_classr   r    r!   r#   r   r%   r4   propertyr?   r   __classcell__r   r   r   r   r      s   !

4
r   )rC   r-   r	   
contextlibr   Zprocessing_utilsr   r   r   r   r   r   <module>   s
   