U
    9%e-+                  	   @   sb  d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
Z
ddlZddlmZ ddlmZ ddlmZ dd	lmZmZ d
dlmZ d
dlmZmZmZmZ d
dlmZ eddddeddddeddddeddddeddddfZ eddd dZ!e"e#Z$ee%dged!d"d#hgd$gd%gd$gd$gd&d'd(dd#d'dd)d)d&d*d+Z&d,d- Z'd.d/ Z(dS )0zhRCV1 dataset.

The dataset page is available at

    http://jmlr.csail.mit.edu/papers/volume5/lewis04a/
    N)GzipFile)makedirsremove)existsjoin   )Bunch)shuffle)
StrOptionsvalidate_params   )get_data_home)RemoteFileMetadata_fetch_remote_pkl_filepath
load_descr)load_svmlight_filesz.https://ndownloader.figshare.com/files/5976069Z@ed40f7e418d10484091b059703eeb95ae3199fe042891dcec4be6696b9968374z lyrl2004_vectors_test_pt0.dat.gz)urlZchecksumfilenamez.https://ndownloader.figshare.com/files/5976066Z@87700668ae45d45d5ca1ef6ae9bd81ab0f5ec88cc95dcef9ae7838f727a13aa6z lyrl2004_vectors_test_pt1.dat.gzz.https://ndownloader.figshare.com/files/5976063Z@48143ac703cbe33299f7ae9f4995db49a258690f60e5debbff8995c34841c7f5z lyrl2004_vectors_test_pt2.dat.gzz.https://ndownloader.figshare.com/files/5976060Z@dfcb0d658311481523c6e6ca0c3f5a3e1d3d12cde5d7a8ce629a9006ec7dbb39z lyrl2004_vectors_test_pt3.dat.gzz.https://ndownloader.figshare.com/files/5976057Z@5468f656d0ba7a83afc7ad44841cf9a53048a5c083eedc005dcdb5cc768924aezlyrl2004_vectors_train.dat.gzz.https://ndownloader.figshare.com/files/5976048Z@2a98e5e5d8b770bded93afc8930d88299474317fe14181aee1466cc754d0d1c1zrcv1v2.topics.qrels.gztraintestallbooleanrandom_state)	data_homesubsetdownload_if_missingr   r	   
return_X_yT)Zprefer_skip_nested_validationFc           '   	   C   s  d}d}d}d}	t | d} t| d}
|r8t|
s8t|
 t|
d}t|
d}t|
d	}t|
d
}|rbt|rxt|sbg }tD ]0}td|j  t	||
d}|
t|d qt||d}t|d |d |d |d |d g }t|d |d |d |d |d f}|jtjdd}tj||dd tj||dd |D ]}|  t|j qFnt|}t|}|rt|rt|stdtj  t	t|
d}d}d}d}tj||ftjd}tj|tjd}i }t|dd}|D ]~}|d d!}t |dkr|\}} }!||kr8|d7 }|||< t!| } | |kr^| }|d7 }| ||< d|||| f< qW 5 Q R X t| t"||}"||"d"d"f }tj#|t$d}#|% D ]}$|$|#||$ < qt&|#}%|#|% }#t'|d"d"|%f }tj||dd tj|#|dd nt|}t|}#|d#kr2n|d$krr|d"|	d"d"f }|d"|	d"d"f }|d"|	 }nL|d%kr||	d"d"d"f }||	d"d"d"f }||	d" }nt(d&| |rt)||||d'\}}}t*d(}&|r||fS t+||||#|&d)S )*a|  Load the RCV1 multilabel dataset (classification).

    Download it if necessary.

    Version: RCV1-v2, vectors, full sets, topics multilabels.

    =================   =====================
    Classes                               103
    Samples total                      804414
    Dimensionality                      47236
    Features            real, between 0 and 1
    =================   =====================

    Read more in the :ref:`User Guide <rcv1_dataset>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    subset : {'train', 'test', 'all'}, default='all'
        Select the dataset to load: 'train' for the training set
        (23149 samples), 'test' for the test set (781265 samples),
        'all' for both, with the training samples first if shuffle is False.
        This follows the official LYRL2004 chronological split.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    return_X_y : bool, default=False
        If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch
        object. See below for more information about the `dataset.data` and
        `dataset.target` object.

        .. versionadded:: 0.20

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object. Returned only if `return_X_y` is False.
        `dataset` has the following attributes:

        - data : sparse matrix of shape (804414, 47236), dtype=np.float64
            The array has 0.16% of non zero values. Will be of CSR format.
        - target : sparse matrix of shape (804414, 103), dtype=np.uint8
            Each sample has a value of 1 in its categories, and 0 in others.
            The array has 3.15% of non zero values. Will be of CSR format.
        - sample_id : ndarray of shape (804414,), dtype=np.uint32,
            Identification number of each sample, as ordered in dataset.data.
        - target_names : ndarray of shape (103,), dtype=object
            Names of each target (RCV1 topics), as ordered in dataset.target.
        - DESCR : str
            Description of the RCV1 dataset.

    (data, target) : tuple
        A tuple consisting of `dataset.data` and `dataset.target`, as
        described above. Returned only if `return_X_y` is True.

        .. versionadded:: 0.20
    i>F i  g   imZ  )r   ZRCV1zsamples.pklzsample_id.pklzsample_topics.pklztopics_names.pklzDownloading %s)dirname)r   )Z
n_features   r   r         	   r            F)copy)compressZdtyperb)r   modeascii Nr   r   r   zLUnknown subset parameter. Got '%s' instead of one of ('all', 'train', test'))r   zrcv1.rst)datatarget	sample_idZtarget_namesZDESCR),r   r   r   r   r   XY_METADATAloggerinfor   r   appendr   r   spZvstackZtocsrnpZhstackZastypeZuint32joblibdumpcloser   nameloadTOPICS_METADATAzerosZuint8int32decodesplitlenint_find_permutationemptyobjectkeysargsortZ
csr_matrix
ValueErrorshuffle_r   r   )'r   r   r   r   r	   r   Z	N_SAMPLESZ
N_FEATURESZN_CATEGORIESZN_TRAINZrcv1_dirZsamples_pathZsample_id_pathZsample_topics_pathZtopics_pathfilesZeach	file_pathZXyXr1   fZtopics_archive_pathZn_catZn_docZdoc_previousyZsample_id_bisZcategory_nameslineZline_componentscatdoc_Zpermutation
categorieskorderZfdescr rW   U/var/www/html/Darija-Ai-API/env/lib/python3.8/site-packages/sklearn/datasets/_rcv1.py
fetch_rcv1K   s    \





,(











    rY   c                 C   s8   | j }tj|tjd}tj|tjd}t|| | |S )zInverse permutation p.r*   )sizer7   r>   r?   Zarangeput)pnsirW   rW   rX   _inverse_permutation$  s
    r`   c                 C   s$   t | }t |}t|}|| S )z!Find the permutation from a to b.)r7   rH   r`   )abtuZu_rW   rW   rX   rD   -  s    

rD   ))__doc__logginggzipr   osr   r   os.pathr   r   r8   numpyr7   Zscipy.sparsesparser6   utilsr   r	   rJ   Zutils._param_validationr
   r    r   _baser   r   r   r   Z_svmlight_format_ior   r2   r=   	getLogger__name__r3   strrY   r`   rD   rW   rW   rW   rX   <module>   s   

 O	