U
    9%e2J                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ dd	lmZmZmZ eeed
ddZeee
jd
ddZdd ZdddZdddZdddZdS )z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)_chunk_generatorcheck_pandas_supportget_chunk_n_rows)	arff_datainclude_columnsreturnc                 C   s   t  t  t  f}dd t|D }t| d | d | d D ]@\}}}||kr:|d | |d | |d ||  q:|S )a  Obtains several columns from sparse ARFF representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included. (e.g., when including [1, 2, 3], the columns will be relabelled
    to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    c                 S   s   i | ]\}}||qS  r   .0Z	array_idxZ
column_idxr   r   \/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/sklearn/datasets/_arff_parser.py
<dictcomp>,   s     z)_split_sparse_columns.<locals>.<dictcomp>r      r   )list	enumeratezipappend)r   r   Zarff_data_newreindexed_columnsvalrow_idxcol_idxr   r   r   _split_sparse_columns   s    "r   c           	      C   s~   t | d d }|t|f}dd t|D }tj|tjd}t| d | d | d D ]"\}}}||krV||||| f< qV|S )Nr   c                 S   s   i | ]\}}||qS r   r   r   r   r   r   r   >   s     z)_sparse_data_to_array.<locals>.<dictcomp>dtyper   r   )maxlenr   npemptyfloat64r   )	r   r   num_obsZy_shaper   yr   r   r   r   r   r   _sparse_data_to_array7   s    "r&   c                 C   sD   | | }t |dkr| | }nt |dkr8| |d  }nd}||fS )a  Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
    r   r   r   N)r    )frameZfeature_namesZtarget_namesXr%   r   r   r   _post_process_frameI   s    
r)   c           "         s  dd }|| }|dkrt jnt j}|dk }	t j|||	d}
|| fdd|
d D  |dkrtd	}t|
d }t| }t|
d
 }|j	|g|dd}|j
dd }t|}fdd|D }|| g}t|
d
 |D ]}||j	||dd|  qt|dkr,|d |d j|d< |j|ddjtjd}~~i }|jD ]P}| d }| dkrzd||< n&| dkrd||< n|j| ||< qR||}t|||\}n|
d
 }fdd|D }fdd|D }t|trt|dkrtd|d dkr d}n|d |d  }tjtj|d |d!}|j | }|dd|f }|dd|f nt|t!rt"||}t#|d d }|t|f} t$j%j&|d |d |d ff| tj'd"}|( }t)||ntd#t*|  fd$d%|D }!|!sn<t+|!r8t, fd&dt-|D nt.|!rJtd'j/d dkrf d(nj/d dkrzd|dkr||dfS |d fS ))a  ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handles the
    dtypes exactly the same.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    c                 s   s   | D ]}| dV  qd S )Nutf-8)decode)	gzip_fileliner   r   r   _io_to_generator   s    z+_liac_arff_parser.<locals>._io_to_generatorsparsepandas)return_typeencode_nominalc                    s(   i | ] \}}t |tr| kr||qS r   )
isinstancer   )r   namecatcolumns_to_selectr   r   r      s
   
  z%_liac_arff_parser.<locals>.<dictcomp>
attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deepc                    s   g | ]}| kr|qS r   r   r   colr6   r   r   
<listcomp>   s      z%_liac_arff_parser.<locals>.<listcomp>r   r   r   )Zignore_index)value	data_typeintegerInt64nominalcategoryc                    s   g | ]}t  | d  qS indexintr   col_nameopenml_columns_infor   r   r?      s   c                    s   g | ]}t  | d  qS rF   rH   rJ   rL   r   r   r?      s   Nz6shape must be provided when arr['data'] is a Generatorr#   )r   count)shaper   z-Unexpected type for data obtained from arff: c                    s   h | ]}| kqS r   r   rJ   )
categoriesr   r   	<setcomp>  s    z$_liac_arff_parser.<locals>.<setcomp>c              
      sJ   g | ]B\}}t t j |d ddd||d f jtddqS )Or   Nr   F)r;   )r!   ZtakeZasarraypopastyperI   )r   irK   )rQ   r%   r   r   r?     s
    zAMix of nominal and non-nominal targets is not currently supported)rN   )0r   ZCOOZ	DENSE_GENloadr	   r   r   keysnextZ	DataFrameZmemory_usagesumr
   r   r   r    rU   dtypesconcatZfillnar!   nanr:   lowerr)   r3   r   
ValueErrorZfromiter	itertoolschainfrom_iterableZreshapetupler   r   spr/   Z
coo_matrixr#   Ztocsrr&   typeallZhstackr   anyrP   )"r,   output_arrays_typerM   feature_names_to_selecttarget_names_to_selectrP   r.   streamr1   r2   Zarff_containerpdZcolumns_infoZcolumns_names	first_rowZfirst_dfZ	row_bytes	chunksizecolumns_to_keepdfsr9   r'   r[   r4   column_dtyper(   r   Zfeature_indices_to_selectZtarget_indices_to_selectrO   Zarff_data_Xr$   ZX_shapeZis_classificationr   )rQ   r7   rM   r%   r   _liac_arff_parseri   s    7
  






  







	

rr   c              
      s  ddl | D ]}|d dr q*qi |D ]:}|| d }| dkrXd|< q2| dkr2d	|< q2fd
dt|D }	dddgddddd|	d	}
|
|pi }j| f|}zdd |D |_W n0 tk
r } zj	d|W 5 d}~X Y nX ||   fdd|jD }|| }t
dfdd}fdd|j D }|D ]}|| j|||< qTt|||\}}|dkr|||dfS | |  }}fdd|j D }||d|fS )a^  ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the metadata
    headers of ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    r   Nr*   z@datarA   rB   rC   rD   rE   c                    s"   i | ]\}}| kr| | qS r   r   )r   r   r4   )r[   r   r   r   ~  s    z'_pandas_arff_parser.<locals>.<dictcomp>F?%"T\)	headerZ	index_colZ	na_valuesZkeep_default_nacomment	quotecharskipinitialspace
escapecharr   c                 S   s   g | ]}|qS r   r   )r   r4   r   r   r   r?     s     z'_pandas_arff_parser.<locals>.<listcomp>zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.c                    s   g | ]}| kr|qS r   r   r=   r6   r   r   r?     s      z^'(?P<contents>.*)'$c                    s"   t  | }|d kr| S |dS )Ncontents)researchgroup)Zinput_stringmatch)single_quote_patternr   r   strip_single_quotes  s    z0_pandas_arff_parser.<locals>.strip_single_quotesc                    s    g | ]\}}t | jr|qS r   )r3   CategoricalDtyper   r4   r   rl   r   r   r?     s   r0   c                    s(   i | ] \}}t | jr||j qS r   )r3   r   rQ   tolistr   r   r   r   r     s    )r0   r+   r^   
startswithr   Zread_csvr:   r_   errorsZParserErrorr}   compiler[   itemsr5   Zrename_categoriesr)   Zto_numpy)r,   rh   rM   ri   rj   read_csv_kwargsr-   r4   rq   Zdtypes_positionalZdefault_read_csv_kwargsr'   excro   r   Zcategorical_columnsr>   r(   r%   rQ   r   )r7   r[   rl   r   r   _pandas_arff_parser4  sh    8






r   c                 C   sH   |dkrt | |||||S |dkr4t| |||||S td| ddS )a6  Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    z	liac-arffr0   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.N)rr   r   r_   )r,   parseroutput_typerM   ri   rj   rP   r   r   r   r   load_arff_from_gzip_file  s*    ;	
r   )N)N)NN)__doc__r`   r}   collectionsr   collections.abcr   typingr   numpyr!   Zscipyrd   Z	externalsr   Zexternals._arffr   utilsr   r	   r
   r   Zndarrayr&   r)   rr   r   r   r   r   r   r   <module>   s8    $ & 
 R 
    