U
    -eBZ                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlm  mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZm Z  d dl!m"Z" ee#Z$dZ%dd Z&G dd deZ'edG dd deZ(dS )    N)ArgumentParser)Path)Optional)config)BaseDatasetsCLICommand)DownloadConfig)DownloadManager)MockDownloadManager)dataset_module_factoryimport_main_class)
deprecated)
get_loggerset_verbosity_warning)
map_nestedzutf-8c              
   C   s*   t | j| j| j| j| j| j| j| j| j		S N)
DummyDataCommandpath_to_datasetauto_generaten_lines
json_fieldxml_tagmatch_text_fileskeep_uncompressed	cache_direncoding)args r   ]/var/www/html/Darija-Ai-Train/env/lib/python3.8/site-packages/datasets/commands/dummy_data.pydummy_data_command_factory   s    r   c                
       s   e Zd Z fddZ fddZ fddZdeee ee ee ee e	d	d
dZ
deeeee ee ee ee edddZedefddZdd Z  ZS )!DummyDataGeneratorDownloadManagerc                    s$   t  j|| || _g | _g | _d S r   )super__init__mock_download_managerdownloaded_dummy_pathsexpected_dummy_paths)selfr"   r   kwargs	__class__r   r   r!   *   s    z*DummyDataGeneratorDownloadManager.__init__c                    s@   t  |}| j|}t| jj|dd t| jj|dd |S NT)Z	map_tuple)r    downloadr"   r   r#   appendr$   r%   Zurl_or_urlsoutputZdummy_outputr'   r   r   r*   0   s
    z*DummyDataGeneratorDownloadManager.downloadc                    sH   t  t  |}| j|}t| jj|dd t| jj|dd |S r)   )r    extractr*   r"   r   r#   r+   r$   r,   r'   r   r   download_and_extract7   s
    z6DummyDataGeneratorDownloadManager.download_and_extract   N)r   r   r   r   r   returnc           
      C   s   t jt j| jj| jj| jjddd d}d| j_t	| j
| jD ]B\}}t j| jj| jj| jj|}	|| j||	|||||d7 }qB|dkrtd |dkS )N
dummy_dataTexist_okr   Fr   r   r   r   r   zDummy data generation failed: no dummy files were created. Make sure the data files format is supported by the auto-generation.)osmakedirspathjoinr"   datasets_scripts_dirdataset_namedummy_data_folderload_existing_dummy_datazipr#   r$   _create_dummy_dataloggererror)
r%   r   r   r   r   r   totalsrc_pathZrelative_dst_pathdst_pathr   r   r   auto_generate_dummy_data_folder>   s@    	
	zADummyDataGeneratorDownloadManager.auto_generate_dummy_data_folder)rC   rD   r   r   r   r   r   r1   c                    s  |pt }tj|r`td|  t|j ddddg}t fdd|D }	|d k	rtj	|}
|
dD ]}|	t|
|O }	qp|	rt|jjd	d	d
 t||d`}t|d|dF}g }t|D ]\}}|kr q|| q|d|  W 5 Q R X W 5 Q R X dS d krt||d}t|}|d k	rJ|| }t|trtdd | D stdt|  dfdd| D }n|d  }|d k	r||i}t|jjd	d	d
 t|d|d}t|| W 5 Q R X W 5 Q R X dS t fdddD rJ|d kr2td n| j ||||d dS td| d dS tj!|rd}t"|D ]f\}}}|D ]T}|#dstj||}tj|t|$|}|| j%||||||d7 }qq||S d S )Nz#Trying to generate dummy data file z.txtz.csvz.jsonlz.tsvc                 3   s   | ]}| kV  qd S r   r   .0	extensiondst_path_extensionsr   r   	<genexpr>w   s     zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>,Tr4   parentsr   w    z.jsonc                 s   s   | ]}t |tV  qd S r   )
isinstancelist)rG   vr   r   r   rK      s     zCouldn't parse columns z\. Maybe specify which json field must be used to read the data with --json_field <my_field>.c                    s   i | ]\}}||d   qS r   r   )rG   krU   )r   r   r   
<dictcomp>   s      zHDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<dictcomp>c                 3   s   | ]}| kV  qd S r   r   rF   rI   r   r   rK      s     )z.xmlz.txmzEFound xml file but 'xml_tag' is set to None. Please provide --xml_tag)r   r   zCouldn't generate dummy file 'z9'. Ignore that if this file is not useful for dummy data.r   .r5   )&DEFAULT_ENCODINGr6   r8   isfiler@   debugr   suffixesanybasenamesplitfnmatchparentmkdiropen	enumerater+   writer9   stripjsonloadrS   dictallvalues
ValueErrorrT   keysitemsdumpwarning_create_xml_dummy_dataisdirwalk
startswithrelative_tor?   )r%   rC   rD   r   r   r   r   r   Zline_by_line_extensionsZis_line_by_line_text_file	file_namepatternsrc_fileZdst_fileZfirst_linesilineZ	json_dataZfirst_json_datarB   r8   _filesnameZsrc_file_pathZdst_file_pathr   )rJ   r   r   r?   h   s    

(



 

	z4DummyDataGeneratorDownloadManager._create_dummy_datac              	   C   s   t |jjddd t| |d}d}g }tj|ddD ]R\}}	|dkrT||	 q8| }
|	j|kr8||k rx|d7 }q8|r8|d	 	|	 q8tj
|	d
j||d W 5 Q R X d S )NTrM   rO   r   )startend)eventsr~   rR   )element)r   ra   rb   rc   ETZ	iterparser+   poptagremoveElementTreere   )rC   rD   r   r   r   rx   Zn_linerN   eventelemr{   r   r   r   rq      s    

z8DummyDataGeneratorDownloadManager._create_xml_dummy_datac                 C   sT   t j|| jj}t j|d}d}td| d t|d|| t	| d S )Nr2   z"Compressing dummy data folder to 'z.zip'r>   )
r6   r8   r9   r"   r<   r@   infoshutilmake_archivermtree)r%   r   root_dir	base_namebase_dirr   r   r   !compress_autogenerated_dummy_data   s    zCDummyDataGeneratorDownloadManager.compress_autogenerated_dummy_data)r0   NNNN)NNNN)__name__
__module____qualname__r!   r*   r/   intr   strboolrE   r?   staticmethodrY   rq   r   __classcell__r   r   r'   r   r   )   sB   	     /    Tr   zThe `datasets` repository does not host the dataset scripts anymore. Therefore, dummy data is no longer needed to test their loading with CI.c                
   @   sr   e Zd ZeedddZeeee	e e	e e	e ee	e e	e d	ddZ
dd Ze	e d	d
dZdd ZdS )r   )parserc                 C   s   | j ddd}|jdddd |jdtd	d
d |jdtd dd |jdtd dd |jdtd dd |jdddd |jdtd dd |jdtd dt d |jdtdd |jtd d S )Nr2   zGenerate dummy data.)helpz--auto_generate
store_truez!Automatically generate dummy data)actionr   z	--n_linesr0   zBNumber of lines or samples to keep when auto-generating dummy data)typedefaultr   z--json_fieldzOptional, json field to read the data from when auto-generating dummy data. In the json data files, this field must point to a list of samples as json objects (ex: the 'data' field for squad-like files)z	--xml_tagz[Optional, xml tag name of the samples inside the xml files when auto-generating dummy data.z--match_text_fileszOptional, a comma separated list of file patterns that looks for line-by-line text files other than *.txt or *.csv. Example: --match_text_files *.labelz--keep_uncompressedzWhether to leave the dummy data folders uncompressed when auto-generating dummy data. Useful for debugging for to do manual adjustements before compressing.z--cache_dirzKCache directory to download and cache files when auto-generating dummy dataz
--encodingz=Encoding to use when auto-generating dummy data. Defaults to r   z/Path to the dataset (example: ./datasets/squad))r   r   )func)
add_parseradd_argumentr   r   rY   set_defaultsr   )r   Ztest_parserr   r   r   register_subcommand   sZ       z$DummyDataCommand.register_subcommand)	r   r   r   r   r   r   r   r   r   c
           
      C   s   || _ tj|r.|tjddd | _n|tjddd | _tj|pVt	j
}|| _|| _|| _|| _|| _|| _|| _|	| _d S )N/r   )_path_to_datasetr6   r8   rr   replacesepr_   _dataset_name
expanduserr   ZHF_DATASETS_CACHE_auto_generate_n_lines_json_field_xml_tag_match_text_files_keep_uncompressed
_cache_dir	_encoding)
r%   r   r   r   r   r   r   r   r   r   r   r   r   r!   	  s    zDummyDataCommand.__init__c              	   C   s   t   t| j}t|j}|jp$d g}g }t }|D ]v}|rF|jnd }|||j	|d}|rd|j
n|jj
}	t| j||	ddd}
| jr|| j||
| jd q8| j||
d q8| jr| jst|rtd| j d ntd	| j d W 5 Q R X d S )
N)config_namehashr   TF)r;   r   versionZuse_local_dummy_datar=   )dataset_buildermock_dl_managerr   )r   r   z>Automatic dummy data generation succeeded for all configs of ''z<Automatic dummy data generation failed for some configs of ')r   r
   r   r   module_pathZBUILDER_CONFIGStempfileTemporaryDirectoryr}   r   r   r   r	   r   r   r+   _autogenerate_dummy_datar   _print_dummy_data_instructionsrj   print)r%   Zdataset_moduleZbuilder_clsZbuilder_configsZauto_generate_resultsZtmp_dirZbuilder_configr   r   r   r   r   r   r   run$  sB    


 zDummyDataCommand.run)r1   c              
      s  | j rtj| j tjntj}t|d}t| j	||d}|
| d|_|j| j| j| j| j| jd |stj|j|j}|| d|_i  tj|j dd z4|
|}|D ] }	|j|	dd |	jj |	j< qW nH tk
r }
 z(td|jj d	t|
  W Y dS d }
~
X Y nfX td
d   D rRt d|jj d dS  fdd D }t d| d|jj d dS n$tj| j!|j"}t#d| d d S )N)r   )r;   r"   download_configFr5   Tr3   )Zcheck_duplicate_keysz&Failed to load dummy data for config 'z''.
Original error:
c                 s   s   | ]}|d kV  qdS )r   Nr   )rG   Z
n_examplesr   r   r   rK   q  s     z<DummyDataCommand._autogenerate_dummy_data.<locals>.<genexpr>zEDummy data generation done and dummy data test succeeded for config 'z''.c                    s   g | ]} | d kr|qS )r   r   )rG   Z
split_nameZn_examples_per_splitr   r   
<listcomp>w  s     z=DummyDataCommand._autogenerate_dummy_data.<locals>.<listcomp>zCDummy data generation done but dummy data test failed since splits z have 0 examples for config 'z#Dummy data generated in directory 'zg' but kept uncompressed. Please compress this directory into a zip file to use it for dummy data tests.)$r   r6   r8   r9   r   ZDOWNLOADED_DATASETS_DIRZDOWNLOADED_DATASETS_PATHr   r   r   _split_generatorsr=   rE   r   r   r   r   r   r:   r;   r   r7   Z_prepare_splitZ
split_infoZnum_examplesr}   OSErrorr@   rA   r   rj   rk   rp   r   r<   r   )r%   r   r   r   Zdl_cache_dirr   Z
dl_managerZpath_do_datasetZsplit_generatorsZsplit_generatoreZempty_splitsZgenerated_dummy_data_dirr   r   r   r   K  sj    
  




z)DummyDataCommand._autogenerate_dummy_datac                 C   s  t j| j|j}td| d t j|dd z||}W nD t	k
r } z&t
d| j d|j d|j d W 5 d }~X Y nX t }g }|j}|D ]}	td	|	j  ||	j |	j}
|jf |
}zjd
}|jd k	rd|jj dnd}|d| | j d| d| d 7 }|D ]
\}}q|d| d7 }W q t	k
rh } z||j W 5 d }~X Y qX qd|}t|dkrt|dkrtt||kr|dtt| d| d| d7 }|}n0d|}|d| d| d7 }|d| d7 }|d| d 7 }t|dkr~tt||kr~|d!| d"| d#| d$7 }|d%| d&| d$7 }|d'| d(| d)| d*7 }nN|d+| d,| d#| d-7 }|d.| d/| d$7 }|d0| d(| d)| d*7 }|d1| d2| d37 }|d47 }t
| d S )5Nz$Creating dummy folder structure for z... Tr3   zDataset z with config a   seems to already open files in the method `_split_generators(...)`. You might consider to instead only open files in the method `_generate_examples(...)` instead. If this is not possible the dummy data has to be created with less guidance. Make sure you create the file rX   z/Collecting dummy data file paths to create for zU
==============================DUMMY DATA INSTRUCTIONS==============================
zconfig z of rQ   z(- In order to create the dummy data for z, please go into the folder 'z' with `cd z` . 

za- It appears that the function `_generate_examples(...)` expects one or more files in the folder z using the function `glob.glob(...)`. In this case, please refer to the `_generate_examples(...)` method to see under which filename the dummy data files should be created. 

z, r   rR   z1- Please create a single dummy data file called 'z' from the folder 'zV'. Make sure that the dummy data file provides at least one example for the split(s) 'z' 

z0- Please create the following dummy data files 'z'

z- For each of the splits 'zU', make sure that one or more of the dummy data files provide at least one example 

z- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have to create other files in addition to 'zG'. In this case please refer to the `_generate_examples(...)` method 

z@- After the dummy data file is created, it should be zipped to 'z.zip' with the command `zip z.zip z` 

z- You can now delete the file 'z' with the command `rm z- To get the file 'z;' back for further changes to the dummy data, simply unzip z.zip with the command `unzip z.zip` 

zP- After all dummy data files are created, they should be zipped recursively to 'z.zip' with the command `zip -r z/` 

z!- You can now delete the folder 'z' with the command `rm -r z- To get the folder 'z'- Make sure you have created the file 'z
.zip' in 'z' 
zT===================================================================================
)r6   r8   r9   r   r<   r@   r   r7   r   FileNotFoundErrorr   r   r   filenamesetdummy_file_namer}   r+   
gen_kwargsZ_generate_examplesaddlennextiter)r%   r   r   r<   Zgenerator_splitsr   Zfiles_to_createZsplit_namesr   r_   r   	generatorZdummy_data_guidance_printZconfig_stringkeyrecordZfiles_stringr   r   r   r     st     
 $
 z/DummyDataCommand._print_dummy_data_instructionsN)r   r   r   r   r   r   r   r   r   r   r!   r   r   r   r   r   r   r   r      s   .':r   ))r`   rg   r6   r   r   Zxml.etree.ElementTreeetreer   r   argparser   pathlibr   typingr   Zdatasetsr   Zdatasets.commandsr   Z!datasets.download.download_configr   Z"datasets.download.download_managerr   Z'datasets.download.mock_download_managerr	   Zdatasets.loadr
   r   Z datasets.utils.deprecation_utilsr   Zdatasets.utils.loggingr   r   Zdatasets.utils.py_utilsr   r   r@   rY   r   r   r   r   r   r   r   <module>   s4    0