U
    ,È-eË  ã                   @   sT   d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ G dd„ deƒZd	S )
z
Processor class for MarkupLM.
é    )ÚOptionalÚUnioné   )Ú
TensorType)ÚProcessorMixin)ÚBatchEncodingÚPaddingStrategyÚTruncationStrategyc                   @   s”   e Zd ZdZdZdZdZdeeee	e
f eee	ef ee eee ee ee eeeeeeee	ef  edœd	d
„Zdd„ Zdd„ Zedd„ ƒZdS )ÚMarkupLMProcessoraJ  
    Constructs a MarkupLM processor which combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single
    processor.

    [`MarkupLMProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`MarkupLMFeatureExtractor`] to extract nodes and corresponding xpaths from one or more HTML strings.
    Next, these are provided to [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`], which turns them into token-level
    `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`.

    Args:
        feature_extractor (`MarkupLMFeatureExtractor`):
            An instance of [`MarkupLMFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`MarkupLMTokenizer` or `MarkupLMTokenizerFast`):
            An instance of [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`]. The tokenizer is a required input.
        parse_html (`bool`, *optional*, defaults to `True`):
            Whether or not to use `MarkupLMFeatureExtractor` to parse HTML strings into nodes and corresponding xpaths.
    ZMarkupLMFeatureExtractor)ZMarkupLMTokenizerZMarkupLMTokenizerFastTNFr   )Úadd_special_tokensÚpaddingÚ
truncationÚ
max_lengthÚstrideÚpad_to_multiple_ofÚreturn_token_type_idsÚreturn_attention_maskÚreturn_overflowing_tokensÚreturn_special_tokens_maskÚreturn_offsets_mappingÚreturn_lengthÚverboseÚreturn_tensorsÚreturnc                 K   sê   | j rR|dkrtdƒ‚|dk	s.|dk	s.|dk	r6tdƒ‚|  |¡}|d }|d }n(|dk	rbtdƒ‚|dksr|dkrztdƒ‚|dk	r˜| j r˜t|tƒr˜|g}| jf |dk	rª|n||dk	r¸|nd||||||	|
|||||||||dœ|—Ž}|S )	aÆ  
        This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it
        passes the `nodes` and `xpaths` along with the additional arguments to [`~MarkupLMTokenizer.__call__`] and
        returns the output.

        Optionally, one can also provide a `text` argument which is passed along as first sequence.

        Please refer to the docstring of the above two methods for more information.
        NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`ÚnodesÚxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`)ÚtextZ	text_pairr   Únode_labelsr   r   r   r   r   r   r   r   r   r   r   r   r   r   )Ú
parse_htmlÚ
ValueErrorZfeature_extractorÚ
isinstanceÚstrÚ	tokenizer)ÚselfZhtml_stringsr   r   r   Z	questionsr   r   r   r   r   r   r   r   r   r   r   r   r   r   ÚkwargsÚfeaturesZencoded_inputs© r&   úq/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/transformers/models/markuplm/processing_markuplm.pyÚ__call__0   sP    !ÿ


îízMarkupLMProcessor.__call__c                 O   s   | j j||ŽS )z¾
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r"   Úbatch_decode©r#   Úargsr$   r&   r&   r'   r)   €   s    zMarkupLMProcessor.batch_decodec                 O   s   | j j||ŽS )z¸
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
        docstring of this method for more information.
        )r"   Údecoder*   r&   r&   r'   r,   ‡   s    zMarkupLMProcessor.decodec                 C   s   | j j}|S )N)r"   Úmodel_input_names)r#   Ztokenizer_input_namesr&   r&   r'   r-   Ž   s    z#MarkupLMProcessor.model_input_names)NNNNNTFNNr   NNNFFFFTN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Zfeature_extractor_classZtokenizer_classr   Úboolr   r!   r   r	   r   Úintr   r   r(   r)   r,   Úpropertyr-   r&   r&   r&   r'   r
      sX                      ìêPr
   N)r1   Útypingr   r   Z
file_utilsr   Zprocessing_utilsr   Ztokenization_utils_baser   r   r	   r
   r&   r&   r&   r'   Ú<module>   s
   