U
    &%e,  ã                   @   sL   d dl Z d dlZd dlmZmZ ddlmZmZ e d¡Z	G dd„ dƒZ
dS )é    N)ÚOptionalÚUnioné   )ÚLanguageFilterÚProbingStates%   [a-zA-Z]*[€-ÿ]+[a-zA-Z]*[^a-zA-Z€-ÿ]?c                   @   sì   e Zd ZdZejfeddœdd„Zddœdd„Zee	e
 dœd	d
„ƒZee	e
 dœdd„ƒZeeef edœdd„Zeedœdd„ƒZedœdd„Zeeeef edœdd„ƒZeeeef edœdd„ƒZeeeef edœdd„ƒZdS )ÚCharSetProbergffffffî?N)Úlang_filterÚreturnc                 C   s$   t j| _d| _|| _t t¡| _d S )NT)	r   Ú	DETECTINGÚ_stateÚactiver   ÚloggingÚ	getLoggerÚ__name__Úlogger)Úselfr   © r   ú`/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/pip/_vendor/chardet/charsetprober.pyÚ__init__,   s    zCharSetProber.__init__)r	   c                 C   s   t j| _d S ©N)r   r
   r   ©r   r   r   r   Úreset2   s    zCharSetProber.resetc                 C   s   d S r   r   r   r   r   r   Úcharset_name5   s    zCharSetProber.charset_namec                 C   s   t ‚d S r   ©ÚNotImplementedErrorr   r   r   r   Úlanguage9   s    zCharSetProber.language)Úbyte_strr	   c                 C   s   t ‚d S r   r   )r   r   r   r   r   Úfeed=   s    zCharSetProber.feedc                 C   s   | j S r   )r   r   r   r   r   Ústate@   s    zCharSetProber.statec                 C   s   dS )Ng        r   r   r   r   r   Úget_confidenceD   s    zCharSetProber.get_confidence)Úbufr	   c                 C   s   t  dd| ¡} | S )Ns   ([ -])+ó    )ÚreÚsub)r    r   r   r   Úfilter_high_byte_onlyG   s    z#CharSetProber.filter_high_byte_onlyc                 C   sZ   t ƒ }t | ¡}|D ]@}| |dd… ¡ |dd… }| ¡ sJ|dk rJd}| |¡ q|S )u7  
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [Â€-Ã¿]
        marker: everything else [^a-zA-ZÂ€-Ã¿]
        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.
        This filter applies to all scripts which do not use English characters.
        Néÿÿÿÿó   €r!   )Ú	bytearrayÚINTERNATIONAL_WORDS_PATTERNÚfindallÚextendÚisalpha)r    ÚfilteredÚwordsÚwordÚ	last_charr   r   r   Úfilter_international_wordsL   s    
z(CharSetProber.filter_international_wordsc                 C   s’   t ƒ }d}d}t| ƒ d¡} t| ƒD ]R\}}|dkrB|d }d}q$|dkr$||krr|sr| | ||… ¡ | d¡ d}q$|sŽ| | |d	… ¡ |S )
a[  
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        Fr   Úcó   >r   ó   <r!   TN)r'   Ú
memoryviewÚcastÚ	enumerater*   )r    r,   Úin_tagÚprevÚcurrÚbuf_charr   r   r   Úremove_xml_tagsn   s     	
zCharSetProber.remove_xml_tags)r   Ú
__module__Ú__qualname__ÚSHORTCUT_THRESHOLDr   ÚNONEr   r   Úpropertyr   Ústrr   r   r   Úbytesr'   r   r   r   Úfloatr   Ústaticmethodr$   r0   r;   r   r   r   r   r   (   s"   !r   )r   r"   Útypingr   r   Úenumsr   r   Úcompiler(   r   r   r   r   r   Ú<module>   s   ÿ