U
    -eE                     @   s  d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	 dddddd	d
ddddddddddgZ
G dd deZG dd deZG dd deZG dd deZG dd	 d	eZG dd
 d
eZG dd deZG dd deZG dd deZG dd deZed d!d" ZG d#d deZG d$d deZG d%d deZG d&d deZG d'd deZG d(d deZG d)d deZG d*d deZdS )+a@  
This file implements the building blocks for transforming a collection
of input strings to the desired format in order to calculate the WER of CER.

In principle, for word error rate calculations, every string of a sentence needs to be
collapsed into a list of strings, where each string is a *single* word.
This is done with [transforms.ReduceToListOfListOfWords][].
A composition of multiple transformations must therefore *always* end with
[transforms.ReduceToListOfListOfWords][].

For the character error rate, every string of a sentence also needs to be collapsed into
a list of strings, but here each string is a *single* character.
This is done with [transforms.ReduceToListOfListOfChars][]. Similarly, a
composition of multiple transformations must therefore also always end with
[transforms.ReduceToListOfListOfChars][].
    N)UnionListMappingAbstractTransformComposeExpandCommonEnglishContractionsRemoveEmptyStringsReduceToListOfListOfWordsReduceToListOfListOfCharsReduceToSingleSentenceRemoveKaldiNonWordsRemoveMultipleSpacesRemovePunctuationRemoveSpecificWordsRemoveWhiteSpaceStripSubstituteRegexesSubstituteWordsToLowerCaseToUpperCasec                   @   sJ   e Zd ZdZeeee f dddZedddZee dd	d
Z	dS )r   z(
    The base class of a Transform.
    )	sentencesc                 C   s:   t |tr| |S t |tr(| |S td|dS )z
        Transforms one or more strings.

        Args:
            sentences: The strings to transform.

        Returns:
            (Union[str, List[str]]): The transformed strings.

        z7input {} was expected to be a string or list of stringsN)
isinstancestrprocess_stringlistprocess_list
ValueErrorformat)selfr    r   Q/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/jiwer/transforms.py__call__G   s    



zAbstractTransform.__call__sc                 C   s
   t  d S N)NotImplementedErrorr   r#   r   r   r    r   ]   s    z AbstractTransform.process_stringinpc                    s    fdd|D S )Nc                    s   g | ]}  |qS r   r   .0r#   r   r   r    
<listcomp>a   s     z2AbstractTransform.process_list.<locals>.<listcomp>r   r   r(   r   r,   r    r   `   s    zAbstractTransform.process_listN)
__name__
__module____qualname____doc__r   r   r   r!   r   r   r   r   r   r    r   B   s   c                   @   s*   e Zd ZdZee dddZdd ZdS )r   a  
    Chain multiple transformations back-to-back to create a pipeline combining multiple
    transformations.

    Note that each transformation needs to end with either `ReduceToListOfListOfWords`
    or `ReduceToListOfListOfChars`, depending on whether word error rate,
    or character error rate is desired.

    Example:
        ```python3
        import jiwer

        jiwer.Compose([
            jiwer.RemoveMultipleSpaces(),
            jiwer.ReduceToListOfListOfWords()
        ])
        ```
    
transformsc                 C   s
   || _ dS )zV

        Args:
            transforms: The list of transformations to chain.
        Nr3   )r   r4   r   r   r    __init__x   s    zCompose.__init__c                 C   s   | j D ]}||}q|S r$   r3   )r   texttrr   r   r    r!      s    

zCompose.__call__N)r/   r0   r1   r2   r   r   r5   r!   r   r   r   r    r   d   s   c                   @   s@   e Zd Zdee dddZedddZee dd	d
ZdS )BaseRemoveTransform )tokens_to_removec                 C   s   || _ || _d S r$   )r:   replace_token)r   r:   r;   r   r   r    r5      s    zBaseRemoveTransform.__init__r"   c                 C   s   | j D ]}||| j}q|S r$   )r:   replacer;   )r   r#   wr   r   r    r      s    
z"BaseRemoveTransform.process_stringr'   c                    s    fdd|D S )Nc                    s   g | ]}  |qS r   r)   r*   r,   r   r    r-      s     z4BaseRemoveTransform.process_list.<locals>.<listcomp>r   r.   r   r,   r    r      s    z BaseRemoveTransform.process_listN)r9   )r/   r0   r1   r   r   r5   r   r   r   r   r   r    r8      s   r8   c                   @   s@   e Zd ZdZdedddZedddZee d	d
dZdS )r	   a  
    Transforms a single input sentence, or a list of input sentences, into
    a list with lists of words, which is the expected format for calculating the
    edit operations between two input sentences on a word-level.

    A sentence is assumed to be a string, where words are delimited by a token
    (such as ` `, space). Each string is expected to contain only a single sentence.
    Empty strings (no output) are removed for the list.

    Example:
        ```python
        import jiwer

        sentences = ["hi", "this is an example"]

        print(jiwer.ReduceToListOfListOfWords()(sentences))
        # prints: [['hi'], ['this', 'is', 'an, 'example']]
        ```
     word_delimiterc                 C   s
   || _ dS )zo
        Args:
            word_delimiter: the character which delimits words. Default is ` ` (space).
        Nr?   r   r@   r   r   r    r5      s    z"ReduceToListOfListOfWords.__init__r"   c                 C   s   dd | | jD gS )Nc                 S   s   g | ]}t |d kr|qS    lenr+   r=   r   r   r    r-      s      z<ReduceToListOfListOfWords.process_string.<locals>.<listcomp>)splitr@   r&   r   r   r    r      s    z(ReduceToListOfListOfWords.process_stringr'   c                 C   s<   g }|D ]}|  |d }|| qt|dkr8g gS |S Nr   r   appendrE   r   r(   Zsentence_collectionZsentenceZlist_of_wordsr   r   r    r      s    z&ReduceToListOfListOfWords.process_listN)r>   	r/   r0   r1   r2   r   r5   r   r   r   r   r   r   r    r	      s   c                   @   s0   e Zd ZdZedddZee dddZdS )	r
   a  
    Transforms a single input sentence, or a list of input sentences, into
    a list with lists of characters, which is the expected format for calculating the
    edit operations between two input sentences on a character-level.

    A sentence is assumed to be a string. Each string is expected to contain only a
    single sentence.

    Example:
        ```python
        import jiwer

        sentences = ["hi", "this is an example"]

        print(jiwer.ReduceToListOfListOfChars()(sentences))
        # prints: [['h', 'i'], ['t', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', 'n', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e']]
        ```
    r"   c                 C   s   dd |D gS )Nc                 S   s   g | ]}|qS r   r   rF   r   r   r    r-      s     z<ReduceToListOfListOfChars.process_string.<locals>.<listcomp>r   r&   r   r   r    r      s    z(ReduceToListOfListOfChars.process_stringr'   c                 C   s<   g }|D ]}|  |d }|| qt|dkr8g gS |S rH   rI   rK   r   r   r    r      s    z&ReduceToListOfListOfChars.process_listNr/   r0   r1   r2   r   r   r   r   r   r   r   r    r
      s   c                   @   s@   e Zd ZdZdedddZedddZee d	d
dZdS )r   a&  
    Transforms multiple sentences into a single sentence.
    This operation can be useful when the number of reference and hypothesis sentences
    differ, and you want to do a minimal alignment over these lists.
    Note that this creates an invariance: `wer([a, b], [a, b])` might not be equal to
    `wer([b, a], [b, a])`.

    Example:
        ```python3
        import jiwer

        sentences = ["hi", "this is an example"]

        print(jiwer.ReduceToSingleSentence()(sentences))
        # prints: ['hi this is an example']
        ```
    r>   r?   c                 C   s
   || _ dS )zd
        :param word_delimiter: the character which delimits words. Default is ` ` (space).
        Nr?   rA   r   r   r    r5      s    zReduceToSingleSentence.__init__r"   c                 C   s   |S r$   r   r&   r   r   r    r     s    z%ReduceToSingleSentence.process_stringr'   c                 C   s6   dd |D }t |dkrg S d| j|gS d S )Nc                 S   s   g | ]}t |d kr|qS rB   rD   r+   ir   r   r    r-     s      z7ReduceToSingleSentence.process_list.<locals>.<listcomp>r   z{})rE   r   r@   join)r   r(   Zfiltered_inpr   r   r    r     s    z#ReduceToSingleSentence.process_listN)r>   rL   r   r   r   r    r      s   c                   @   s4   e Zd ZdZeeef dddZedddZdS )	r   ag  
    Transform strings by substituting substrings matching regex expressions into
    another substring.

    Example:
        ```python
        import jiwer

        sentences = ["is the world doomed or loved?", "edibles are allegedly cultivated"]

        # note: the regex string "(\w+)ed", matches every word ending in 'ed',
        # and "" stands for the first group ('\w+). It therefore removes 'ed' in every match.
        print(jiwer.SubstituteRegexes({r"doom": r"sacr", r"(\w+)ed": r""})(sentences))

        # prints: ["is the world sacr or lov?", "edibles are allegedly cultivat"]
        ```
    substitutionsc                 C   s
   || _ dS )zj

        Args:
            substitutions: a mapping of regex expressions to replacement strings.
        NrQ   r   rR   r   r   r    r5      s    zSubstituteRegexes.__init__r"   c                 C   s&   | j  D ]\}}t|||}q
|S r$   )rR   itemsresubr   r#   keyvaluer   r   r    r   (  s    z SubstituteRegexes.process_stringNr/   r0   r1   r2   r   r   r5   r   r   r   r   r    r     s   c                   @   s4   e Zd ZdZeeef dddZedddZdS )	r   a|  
    This transform can be used to replace a word into another word.
    Note that the whole word is matched. If the word you're attempting to substitute
    is a substring of another word it will not be affected.
    For example, if you're substituting `foo` into `bar`, the word `foobar` will NOT
    be substituted into `barbar`.

    Example:
        ```python
        import jiwer

        sentences = ["you're pretty", "your book", "foobar"]

        print(jiwer.SubstituteWords({"pretty": "awesome", "you": "i", "'re": " am", 'foo': 'bar'})(sentences))

        # prints: ["i am awesome", "your book", "foobar"]
        ```

    rQ   c                 C   s
   || _ dS )z[
        Args:
            substitutions: A mapping of words to replacement words.
        NrQ   rS   r   r   r    r5   D  s    zSubstituteWords.__init__r"   c                 C   s2   | j  D ]"\}}tdt|||}q
|S )Nz\b{}\b)rR   rT   rU   rV   r   escaperW   r   r   r    r   K  s    zSubstituteWords.process_stringNrZ   r   r   r   r    r   /  s   c                       s*   e Zd ZdZee d fddZ  ZS )r   a%  
    Can be used to filter out certain words.
    As words are replaced with a ` ` character, make sure to that
    `RemoveMultipleSpaces`, `Strip()` and `RemoveEmptyStrings` are present
    in the composition _after_ `RemoveSpecificWords`.

    Example:
        ```python
        import jiwer

        sentences = ["yhe awesome", "the apple is not a pear", "yhe"]

        print(jiwer.RemoveSpecificWords(["yhe", "the", "a"])(sentences))
        # prints: ['  awesome', '  apple is not   pear', ' ']
        # note the extra spaces
        ```
    )words_to_removec                    s   dd |D }t  | dS )zM
        Args:
            words_to_remove: List of words to remove.
        c                 S   s   i | ]
}|d qS )r>   r   )r+   wordr   r   r    
<dictcomp>j  s      z0RemoveSpecificWords.__init__.<locals>.<dictcomp>N)superr5   )r   r\   mapping	__class__r   r    r5   e  s    zRemoveSpecificWords.__init__)r/   r0   r1   r2   r   r   r5   __classcell__r   r   ra   r    r   R  s   c                       s(   e Zd ZdZded fddZ  ZS )r   a|  
    This transform filters out white space characters.
    Note that by default space (` `) is also removed, which will make it impossible to
    split a sentence into a list of words by using `ReduceToListOfListOfWords` or
    `ReduceToSingleSentence`.
    This can be prevented by replacing all whitespace with the space character.
    If so, make sure that `jiwer.RemoveMultipleSpaces`,
    `Strip()` and `RemoveEmptyStrings` are present in the composition _after_
    `RemoveWhiteSpace`.

    Example:
        ```python
        import jiwer

        sentences = ["this is an example", "hello world	"]

        print(jiwer.RemoveWhiteSpace()(sentences))
        # prints: ["thisisanexample", "helloworld"]

        print(jiwer.RemoveWhiteSpace(replace_by_space=True)(sentences))
        # prints: ["this is an example", "hello world  "]
        # note the trailing spaces
        ```
    F)replace_by_spacec                    s2   dd t jD }|rd}nd}t j||d dS )zq

        Args:
            replace_by_space: every white space character is replaced with a space (` `)
        c                 S   s   g | ]}|qS r   r   )r+   cr   r   r    r-     s     z-RemoveWhiteSpace.__init__.<locals>.<listcomp>r>   r9   )r;   N)string
whitespacer_   r5   )r   rd   
charactersr;   ra   r   r    r5     s
    zRemoveWhiteSpace.__init__)F)r/   r0   r1   r2   boolr5   rc   r   r   ra   r    r   o  s   rC   c                  C   s$   t tjd } tdd | D }|S )z9Compute the punctuation characters only once and memoize.rC   c                 s   s*   | ]"}t t|d rt|V  qdS )PN)unicodedatacategorychr
startswithrN   r   r   r    	<genexpr>  s     z._get_punctuation_characters.<locals>.<genexpr>)rangesys
maxunicodeset)Z
codepointspunctuationr   r   r    _get_punctuation_characters  s
    ru   c                       s    e Zd ZdZ fddZ  ZS )r   a  
    This transform filters out punctuation. The punctuation characters are defined as
    all unicode characters whose category name starts with `P`.
    See [here](https://www.unicode.org/reports/tr44/#General_Category_Values) for more
    information.
    Example:
        ```python
        import jiwer

        sentences = ["this is an example!", "hello. goodbye"]

        print(jiwer.RemovePunctuation()(sentences))
        # prints: ['this is an example', "hello goodbye"]
        ```
    c                    s   t  }t | d S r$   )ru   r_   r5   )r   Zpunctuation_charactersra   r   r    r5     s    zRemovePunctuation.__init__)r/   r0   r1   r2   r5   rc   r   r   ra   r    r     s   c                   @   s0   e Zd ZdZedddZee dddZdS )	r   ao  
    Filter out multiple spaces between words.

    Example:
        ```python
        import jiwer

        sentences = ["this is   an   example ", "  hello goodbye  ", "  "]

        print(jiwer.RemoveMultipleSpaces()(sentences))
        # prints: ['this is an example ', " hello goodbye ", " "]
        # note that there are still trailing spaces
        ```

    r"   c                 C   s   t dd|S )Nz\s\s+r>   rU   rV   r&   r   r   r    r     s    z#RemoveMultipleSpaces.process_stringr'   c                    s    fdd|D S )Nc                    s   g | ]}  |qS r   r)   r*   r,   r   r    r-     s     z5RemoveMultipleSpaces.process_list.<locals>.<listcomp>r   r.   r   r,   r    r     s    z!RemoveMultipleSpaces.process_listNrM   r   r   r   r    r     s   c                   @   s   e Zd ZdZedddZdS )r   a~  
    Removes all leading and trailing spaces.

    Example:
        ```python
        import jiwer

        sentences = [" this is an example ", "  hello goodbye  ", "  "]

        print(jiwer.Strip()(sentences))
        # prints: ['this is an example', "hello goodbye", ""]
        # note that there is an empty string left behind which might need to be cleaned up
        ```
    r"   c                 C   s   |  S r$   stripr&   r   r   r    r     s    zStrip.process_stringNr/   r0   r1   r2   r   r   r   r   r   r    r     s   c                   @   s0   e Zd ZdZedddZee dddZdS )	r   a   
    Remove empty strings from a list of strings.

    Example:
        ```python
        import jiwer

        sentences = ["", "this is an example", " ",  "                "]

        print(jiwer.RemoveEmptyStrings()(sentences))
        # prints: ['this is an example']
        ```
    r"   c                 C   s   |  S r$   rw   r&   r   r   r    r     s    z!RemoveEmptyStrings.process_stringr'   c                    s    fdd|D S )Nc                    s   g | ]}  |d kr|qS )r9   r)   r*   r,   r   r    r-     s      z3RemoveEmptyStrings.process_list.<locals>.<listcomp>r   r.   r   r,   r    r     s    zRemoveEmptyStrings.process_listNrM   r   r   r   r    r     s   c                   @   s   e Zd ZdZedddZdS )r   u6  
    Replace common contractions such as `let's` to `let us`.

    Currently, this method will perform the following replacements. Note that `␣` is
     used to indicate a space (` `) to get around markdown rendering constrains.

    | Contraction   | transformed into |
    | ------------- |:----------------:|
    | `won't`       | `␣will not`      |
    | `can't`       | `␣can not`       |
    | `let's`       | `␣let us`        |
    | `n't`         | `␣not`           |
    | `'re`         | `␣are`           |
    | `'s`          | `␣is`            |
    | `'d`          | `␣would`         |
    | `'ll`         | `␣will`          |
    | `'t`          | `␣not`           |
    | `'ve`         | `␣have`          |
    | `'m`          | `␣am`            |

    Example:
        ```python
        import jiwer

        sentences = ["she'll make sure you can't make it", "let's party!"]

        print(jiwer.ExpandCommonEnglishContractions()(sentences))
        # prints: ["she will make sure you can not make it", "let us party!"]
        ```

    r"   c                 C   s   t dd|}t dd|}t dd|}t dd|}t d	d
|}t dd|}t dd|}t dd|}t dd|}t dd|}t dd|}|S )Nzwon'tzwill notzcan\'tzcan notzlet\'szlet uszn\'tz notz\'rez arez\'sz isz\'dz wouldz\'llz willz\'tz\'vez havez\'mz amrv   r&   r   r   r    r     s    z.ExpandCommonEnglishContractions.process_stringNry   r   r   r   r    r     s    c                   @   s   e Zd ZdZedddZdS )r   z
    Convert every character into lowercase.
    Example:
        ```python
        import jiwer

        sentences = ["You're PRETTY"]

        print(jiwer.ToLowerCase()(sentences))

        # prints: ["you're pretty"]
        ```
    r"   c                 C   s   |  S r$   )lowerr&   r   r   r    r   @  s    zToLowerCase.process_stringNry   r   r   r   r    r   1  s   c                   @   s   e Zd ZdZedddZdS )r   z
    Convert every character to uppercase.

    Example:
        ```python
        import jiwer

        sentences = ["You're amazing"]

        print(jiwer.ToUpperCase()(sentences))

        # prints: ["YOU'RE AMAZING"]
        ```
    r"   c                 C   s   |  S r$   )upperr&   r   r   r    r   T  s    zToUpperCase.process_stringNry   r   r   r   r    r   D  s   c                   @   s   e Zd ZdZedddZdS )r   a  
    Remove any word between `[]` and `<>`. This can be useful when working
    with hypotheses from the Kaldi project, which can output non-words such as
    `[laugh]` and `<unk>`.

    Example:
        ```python
        import jiwer

        sentences = ["you <unk> like [laugh]"]

        print(jiwer.RemoveKaldiNonWords()(sentences))

        # prints: ["you  like "]
        # note the extra spaces
        ```
    r"   c                 C   s   t dd|S )Nz[<\[][^>\]]*[>\]]r9   rv   r&   r   r   r    r   k  s    z"RemoveKaldiNonWords.process_stringNry   r   r   r   r    r   X  s   ) r2   rq   	functoolsrU   rf   rk   typingr   r   r   __all__objectr   r   r8   r	   r
   r   r   r   r   r   	lru_cacheru   r   r   r   r   r   r   r   r   r   r   r   r    <module>   sX   "#-%%"#*
	6