U
    sVct                     @  s  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
ZddlmZmZmZ ddlmZmZ ddlZdd	lmZmZ dd
lmZ ddlmZ ddlm  m  mZ ddl m!Z! dddddZ"ddddddZ#G dd dZ$G dd dZ%G dd de!ej&Z'dS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)cast)CompressionOptionsFilePath
ReadBuffer)EmptyDataErrorOutOfBoundsDatetime)	DataFrameisna)
get_handle)Parser)
ReaderBasefloatstrZsas_datetimeunitc                 C  sV   t | rtjS |dkr,tdddt| d S |dkrJtdddt| d S tdd S )Ns     )secondsd)dayszunit must be 'd' or 's')r   pdZNaTr   r   
ValueErrorr    r   :/tmp/pip-unpacked-wheel-xj8nt62q/pandas/io/sas/sas7bdat.py_parse_datetime2   s    r   z	pd.Series)sas_datetimesr   returnc                 C  sJ   zt j| |ddW S  tk
rD   | jt|d}tt j|}| Y S X dS )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originr   N)r   Zto_datetimer   applyr   r   Series)r    r   Zs_seriesr   r   r   _convert_datetimes@   s    r&   c                   @  sD   e Zd ZU ded< ded< ded< ded< dddddddd	Zd
S )_SubheaderPointerintoffsetlengthcompressionptypeNone)r)   r*   r+   r,   r!   c                 C  s   || _ || _|| _|| _d S N)r)   r*   r+   r,   )selfr)   r*   r+   r,   r   r   r   __init__`   s    z_SubheaderPointer.__init__N__name__
__module____qualname____annotations__r0   r   r   r   r   r'   Z   s
   
r'   c                   @  sX   e Zd ZU ded< ded< ded< ded< ded< ded	< ddddddd
dddZdS )_Columnr(   col_idstr | bytesnamelabelformatbytesctyper*   r-   )r7   r9   r:   r;   r=   r*   r!   c                 C  s(   || _ || _|| _|| _|| _|| _d S r.   )r7   r9   r:   r;   r=   r*   )r/   r7   r9   r:   r;   r=   r*   r   r   r   r0   o   s    
z_Column.__init__Nr1   r   r   r   r   r6   g   s   
r6   c                   @  s   e Zd ZU dZded< ded< d`d	d
d
ddd
d
ddd	ddZddddZddddZddddZddddZ	ddddZ
dddd Zddd!d"d#Zdddd$d%d&Zddd'd(d)Zddd*d+d,d-Zddd.d/Zd
dd0d1Zd2d3 Zddd4d5Zd6dd7d8d9Zddd:d;d<d=Zdd6d>d?d@Zdd:ddAdBdCZdddd+dDdEZdddd+dFdGZdddd+dHdIZdddd+dJdKZdddd+dLdMZdddd+dNdOZdddd+dPdQZdddd+dRdSZdadddTdUdVZ dWdX Z!dddYdZZ"d[d\ Z#d6d*d]d^d_Z$dS )bSAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    r(   _int_lengthzbytes | None_cached_pageNTinferzFilePath | ReadBuffer[bytes]boolz
int | Nonez
str | Noner   r-   )	path_or_bufconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textr+   r!   c
           
      C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|dd|	d| _| jj| _z|   |   W n tk
r   |    Y nX d S )Nzlatin-1    r   rbF)Zis_textr+   )indexrD   rE   rF   rG   rH   rI   default_encodingr+   column_names_rawcolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersr@   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   handleshandle_path_or_buf_get_properties_parse_metadata	Exceptionclose)
r/   rC   rL   rD   rE   rF   rG   rH   rI   r+   r   r   r   r0      sD       
zSAS7BDATReader.__init__z
np.ndarray)r!   c                 C  s   t j| jt jdS )z5Return a numpy int64 array of the column data lengthsdtype)npasarrayrS   int64r/   r   r   r   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc                 C  s   t j| jt jdS )z0Return a numpy int64 array of the column offsetsr^   )r`   ra   rT   rb   rc   r   r   r   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1r^   )r`   ra   rU   r_   rc   r   r   r   column_types   s    zSAS7BDATReader.column_typesc                 C  s   | j   d S r.   )rW   r]   rc   r   r   r   r]      s    zSAS7BDATReader.closec                 C  s  | j d | j d| _| jdttj tjkr<tdd\}}| tj	tj
}|tjkrtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| tjtj}|tjkrtj}|| }| tjtj}|d	krd
| _nd| _| tjtjd }|tjkrtj| | _nd| d| _| tjtj }|dkrJd| _!n|dkr\d| _!nd| _!| "tj#tj$| _%| "tj&tj'| _(t)ddd}| *tj+| tj,}|t-j.|dd | _/| *tj0| tj1}|t-j.|dd | _2| 3tj4| tj5| _6| j | j6d }|  j|7  _t| j| j6kr2td| 3tj7| tj8| _9| 3tj:| tj;| _<| "tj=| tj>| _=| "tj?| tj@| _A| "tjB| tjC| _D| "tjE| tjF| _G| jGs| "tjH| tjI| _Gd S )Nr   i   z'magic number mismatch (not a SAS file?)r   r   T   F      <>zunknown (code=)   1unix   2Zwindowsunknownr   r   r   r#   z*The SAS7BDAT file appears to be truncated.)JrY   seekreadr@   lenconstmagicr   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueU64r?   Zpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_length
byte_orderZencoding_offsetZencoding_lengthZencoding_namesfile_encodingZplatform_offsetZplatform_lengthplatform_read_and_convert_header_textZdataset_offsetZdataset_lengthr9   Zfile_type_offsetZfile_type_length	file_typer   _read_floatZdate_created_offsetZdate_created_lengthr   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified	_read_intZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_length
os_versionZos_name_offsetZos_name_lengthos_nameZos_maker_offsetZos_maker_length)r/   Zalign1Zalign2bufZtotal_alignepochxr   r   r   rZ      s    




            zSAS7BDATReader._get_propertiesr   c                 C  s(   | j | jpdd}|jr$|   t|S )Nr   )nrows)rs   rF   emptyr]   StopIteration)r/   dar   r   r   __next__S  s
    zSAS7BDATReader.__next__)r)   widthc                 C  sJ   |dkr|    td| ||}|dkr0dnd}t| j| |d S )N)ri   rh   zinvalid float widthri   fr   r   r]   r   rw   structunpackr{   )r/   r)   r   r   fdr   r   r   r   [  s    zSAS7BDATReader._read_float)r)   r   r!   c                 C  sP   |dkr|    td| ||}ddddd| }t| j| |d }|S )N)r      ri   rh   zinvalid int widthbhlqr   r   )r/   r)   r   r   itZivr   r   r   r   d  s    zSAS7BDATReader._read_int)r)   r*   c                 C  s   | j d krX| j| | j|}t||k rT|   d|dd|dd}t||S || t| j krz|   td| j |||  S d S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)r@   rY   rr   rs   rt   r]   r   )r/   r)   r*   r   msgr   r   r   rw   m  s    
zSAS7BDATReader._read_bytesr8   )r)   r*   r!   c                 C  s   |  | ||dS )N     )_convert_header_textrw   rstripr/   r)   r*   r   r   r   r~   |  s    z,SAS7BDATReader._read_and_convert_header_textc                 C  sN   d}|sJ| j | j| _t| jdkr(qJt| j| jkr@td|  }qd S )NFr   z2Failed to read a meta data page from the SAS file.)rY   rs   r   r@   rt   r   _process_page_meta)r/   doner   r   r   r[     s    zSAS7BDATReader._parse_metadatac                 C  sZ   |    tjtjtjg }| j|kr,|   | jtjk}| jtjk}t|pV|pV| j	g kS r.   )
_read_page_headerru   page_meta_typesZpage_amd_typepage_mix_type_current_page_type_process_page_metadatapage_data_typerB   rR   )r/   ptZis_data_pageZis_mix_pager   r   r   r     s    
z!SAS7BDATReader._process_page_metac                 C  s^   | j }tj| }| |tjtj@ | _tj| }| |tj| _	tj
| }| |tj| _d S r.   )ry   ru   Zpage_type_offsetr   Zpage_type_lengthZpage_type_mask2r   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r/   
bit_offsetZtxr   r   r   r     s    


 z SAS7BDATReader._read_page_headerc                 C  sp   | j }t| jD ]Z}| tj| |}|jdkr2q|jtjkr@q| 	|j
}| ||j|j}| || qd S )Nr   )ry   ranger   _process_subheader_pointersru   Zsubheader_pointers_offsetr*   r+   Ztruncated_subheader_id_read_subheader_signaturer)   _get_subheader_indexr,   _process_subheader)r/   r   ipointersubheader_signaturesubheader_indexr   r   r   r     s"     
  z%SAS7BDATReader._process_page_metadatar<   )	signaturer!   c                 C  s`   t j|}|d kr\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n|   t	d|S )Nr   rJ   zUnknown subheader signature)
ru   Zsubheader_signature_to_indexgetZcompressed_subheader_idZcompressed_subheader_typer+   SASIndexdata_subheader_indexr]   r   )r/   r   r+   r,   rL   f1f2r   r   r   r     s    

z#SAS7BDATReader._get_subheader_indexr'   )r)   subheader_pointer_indexr!   c           
      C  st   | j }|||  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}t||||}	|	S )Nr   )rz   r   r?   r'   )
r/   r)   r   Zsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typer   r   r   r   r     s     

   z*SAS7BDATReader._process_subheader_pointers)r)   r!   c                 C  s   |  || j}|S r.   )rw   r?   )r/   r)   r   r   r   r   r     s    z(SAS7BDATReader._read_subheader_signature)r   r   r!   c                 C  s   |j }|j}|tjjkr | j}n|tjjkr4| j}n|tjjkrH| j	}n|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr| j}nL|tjjkr| j}n8|tjjkr| j}n$|tjjkr| j| d S td||| d S )Nzunknown subheader index)r)   r*   ru   r   Zrow_size_index_process_rowsize_subheaderZcolumn_size_index_process_columnsize_subheaderZcolumn_text_index_process_columntext_subheaderZcolumn_name_index_process_columnname_subheaderZcolumn_attributes_index#_process_columnattributes_subheaderZformat_and_label_index_process_format_subheaderZcolumn_list_index_process_columnlist_subheaderZsubheader_counts_index_process_subheader_countsr   rR   appendr   )r/   r   r   r)   r*   	processorr   r   r   r     s.    z!SAS7BDATReader._process_subheaderc                 C  s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )r?   rx   r   ru   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r/   r)   r*   int_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r     s8    
    
z)SAS7BDATReader._process_rowsize_subheaderc                 C  sT   | j }||7 }| ||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)r?   r   column_countr   r   print)r/   r)   r*   r   r   r   r   r     s    z,SAS7BDATReader._process_columnsize_subheaderc                 C  s   d S r.   r   r   r   r   r   r   %  s    z(SAS7BDATReader._process_subheader_countsc           	      C  s  || j 7 }| |tj}| ||}|d| d}| j| t| jdkrd}tj	D ]}||kr\|}q\|| _
|| j 8 }|d }| jr|d7 }| || j}|d}|dkrd| _|d }| jr|d7 }| || j}|d| j | _n|tjkr4|d	 }| jr|d7 }| || j}|d| j | _nH| jdkr|d| _|d }| jr^|d7 }| || j}|d| j | _t| d
r| | j| _d S )Nr   r   r   rJ      ri           (   creator_proc)r?   r   ru   Ztext_block_size_lengthrw   r   rN   r   rt   Zcompression_literalsr+   rx   r   r   r   Zrle_compressionhasattrr   )	r/   r)   r*   Ztext_block_sizer   Z	cname_rawZcompression_literalZclZoffset1r   r   r   r   (  sN    



z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }||
|
|  }| j| | q*d S )Nr      rh   r   )r?   r   ru   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetr   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthrN   rO   r   r   )r/   r)   r*   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_rawcnamer   r   r   r   V  sD      
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkrdnd q&d S )Nr   r   rh   r      d   s)r?   r   ru   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetr   rT   r   Zcolumn_data_length_lengthrS   Zcolumn_type_lengthrU   )
r/   r)   r*   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesr   r   r   r   r   w  s*    
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  s   d S r.   r   r   r   r   r   r     s    z,SAS7BDATReader._process_columnlist_subheaderc                 C  sx  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }| ||||  }| j| }| ||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )r?   ru   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetr   Z)column_format_text_subheader_index_lengthminrt   rN   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthr   rQ   r6   rO   rU   rS   rP   r   )r/   r)   r*   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenr   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr   r   r   r     sZ       


	z(SAS7BDATReader._process_format_subheader)r   r!   c                 C  s   |d kr| j d k	r| j }n|d kr(| j}t| jdkrF|   td|dkr`| j| jkr`t S | j| j }||krx|}| jd}| jd}t	j
||ftd| _t	j|d| ft	jd| _d| _t| }|| |  }| jd k	r|| j}|S )Nr   zNo columns to parse from filer   r   r^   rh   )rF   r   rt   rU   r]   r
   rV   r   countr`   r   object_string_chunkzerosZuint8_byte_chunk_current_row_in_chunk_indexr   rs   _chunk_to_dataframerL   Z	set_index)r/   r   mZndnsprsltr   r   r   rs     s.    

zSAS7BDATReader.readc                 C  s   g | _ | j| j| _t| jdkr(dS t| j| jkrf|   dt| jdd| jdd}t||   | j	t
jkr|   | j	t
jt
jt
jg kr|  S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)rR   rY   rs   r   r@   rt   r]   r   r   r   ru   r   r   r   r   _read_next_page)r/   r   r   r   r   r     s$    zSAS7BDATReader._read_next_pagec                 C  s  | j }| j}t|| |}i }d\}}t| jD ]R}| j| }| j| dkr| j|d d f j| jd d}	t	j
|	tj|d||< | jr| j| tjkrt|| d||< n"| j| tjkrt|| d||< |d7 }q0| j| dkrdt	j
| j|d d f |d	||< | jr2| jd k	r2| || j||< | jrZ|| j d
k}
tj|| |
< |d7 }q0|   tdt| j|  q0t|| j|dd}|S )Nrg   r   r   r^   )r_   rL   r   r   r   )rL   r   zunknown column type F)rQ   rL   copy)r   rV   r   r   rO   rU   r   viewr{   r   r%   r`   Zfloat64rD   rP   ru   Zsas_date_formatsr&   Zsas_datetime_formatsr   rH   rG   _decode_stringr   rE   rt   nanr]   r   reprr   )r/   nr   ixr   ZjsZjbjr9   Zcol_arriiZdfr   r   r   r     s8    
 
 
z"SAS7BDATReader._chunk_to_dataframec                 C  s   | | jp| jS r.   )decoderG   rM   r/   r   r   r   r   r   )  s    zSAS7BDATReader._decode_string)r   r!   c                 C  s   | j r| |S |S d S r.   )rI   r   r   r   r   r   r   ,  s    
z#SAS7BDATReader._convert_header_text)NTTNNTTrA   )N)%r2   r3   r4   __doc__r5   r0   rd   re   rf   r]   rZ   r   r   r   rw   r~   r[   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rs   r   r   r   r   r   r   r   r   r>      sV   
         3l		
.!5"$r>   )(r   
__future__r   collectionsr   r   r   r   typingr   Znumpyr`   Zpandas._typingr   r   r	   Zpandas.errorsr
   r   Zpandasr   r   r   Zpandas.io.commonr   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsioZsasZsas_constantsru   Zpandas.io.sas.sasreaderr   r   r&   r'   r6   Iteratorr>   r   r   r   r   <module>   s&   