from typing import List, Union

from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import PIPELINE_INIT_ARGS, Pipeline


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(PIPELINE_INIT_ARGS)
class ImageToTextPipeline(Pipeline):
    """
    Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image to text pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    "image-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        forward_kwargs = {}
        preprocess_params = {}

        if prompt is not None:
            preprocess_params["prompt"] = prompt
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if generate_kwargs is not None:
            forward_kwargs["generate_kwargs"] = generate_kwargs
        if max_new_tokens is not None:
            if "generate_kwargs" not in forward_kwargs:
                forward_kwargs["generate_kwargs"] = {}
            if "max_new_tokens" in forward_kwargs["generate_kwargs"]:
                raise ValueError(
                    "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter,"
                    " please use only one"
                )
            forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens
        return preprocess_params, forward_kwargs, {}

    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
        """
        Assign labels to the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. By default, uses the `generate` default.

            generate_kwargs (`Dict`, *optional*):
                Keyword arguments passed directly to `generate`, allowing full control over that function.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
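
        Example (an illustrative sketch; it reuses the checkpoint from the class docstring, and the generated
        caption is whatever that model produces for these settings):

        ```python
        >>> from transformers import pipeline

        >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
        >>> captioner(
        ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
        ...     max_new_tokens=10,
        ...     generate_kwargs={"do_sample": False},
        ...     timeout=10.0,
        ... )
        ```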
        return super().__call__(images, **kwargs)

    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        if prompt is not None:
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            model_type = self.model.config.model_type

            if model_type == "git":
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                # GIT expects the prompt tokens to be prefixed with the CLS token.
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})

            elif model_type == "pix2struct":
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)

            elif model_type != "vision-encoder-decoder":
                # vision-encoder-decoder does not support conditional generation
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)

            else:
                raise ValueError(f"Model type {model_type} does not support conditional text generation")

        else:
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)

        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        return model_inputs

    def _forward(self, model_inputs, generate_kwargs=None):
        # GIT sets `input_ids` to `None` in `preprocess` when no prompt is given; in batched mode these are
        # collated into a list of `None` values, which `generate` cannot handle, so collapse them back to `None`.
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        if generate_kwargs is None:
            generate_kwargs = {}
        # Pop the main input so `generate` receives it positionally; the TF and PyTorch `generate`
        # implementations resolve keyword inputs differently, and the positional form works for both.
        inputs = model_inputs.pop(self.model.main_input_name)
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        return model_outputs

    def postprocess(self, model_outputs):
        records = []
        for output_ids in model_outputs:
            record = {
                "generated_text": self.tokenizer.decode(
                    output_ids,
                    skip_special_tokens=True,
                )
            }
            records.append(record)
        return records
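
# Usage sketch (illustrative, not part of the upstream module): conditional image-to-text
# generation with a `prompt`, which `preprocess` above supports for the `git` and
# `pix2struct` model types. The checkpoint name "microsoft/git-base-coco" is an assumption
# made for this example; any GIT-style captioning checkpoint should behave the same way.
#
#     from transformers import pipeline
#
#     captioner = pipeline("image-to-text", model="microsoft/git-base-coco")
#     captioner(
#         "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
#         prompt="a photo of",
#         max_new_tokens=20,
#     )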